Commit 9c451a3
Parent(s): dc5e848
first commit
- .gitattributes +15 -0
- .gitignore +20 -0
- README.md +171 -0
- assets/DMDR.webp +3 -0
- assets/Z-Image-Gallery.pdf +3 -0
- assets/architecture.webp +3 -0
- assets/bottle.jpg +3 -0
- assets/canny.jpg +3 -0
- assets/decoupled-dmd.webp +3 -0
- assets/depth.jpg +3 -0
- assets/depth_cat.png +3 -0
- assets/hed.jpg +3 -0
- assets/inpaint.jpg +3 -0
- assets/leaderboard.png +3 -0
- assets/leaderboard.webp +0 -0
- assets/man_hed.png +3 -0
- assets/mask.jpg +3 -0
- assets/mask_inpaint.jpg +3 -0
- assets/pose.jpg +3 -0
- assets/pose2.jpg +3 -0
- assets/pose3.jpg +3 -0
- assets/pose4.png +3 -0
- assets/reasoning.png +3 -0
- assets/room_mlsd.png +3 -0
- assets/showcase.jpg +3 -0
- assets/showcase_editing.png +3 -0
- assets/showcase_realistic.png +3 -0
- assets/showcase_rendering.png +3 -0
- diffusers_local/__init__.py +7 -0
- diffusers_local/patch.py +509 -0
- diffusers_local/pipeline_z_image_control_unified.py +910 -0
- diffusers_local/z_image_control_transformer_2d.py +1443 -0
- infer_controlnet.py +146 -0
- infer_i2i.py +94 -0
- infer_inpaint.py +109 -0
- infer_t2i.py +89 -0
- model_index.json +24 -0
- requirements.txt +22 -0
- results/canny.png +3 -0
- results/depth.png +3 -0
- results/hed.png +3 -0
- results/new_tests/controlnet_result_i2i.png +3 -0
- results/new_tests/result_control_canny.png +3 -0
- results/new_tests/result_control_depth.png +3 -0
- results/new_tests/result_control_hed.png +3 -0
- results/new_tests/result_control_inpaint_original_mask.png +3 -0
- results/new_tests/result_control_mlsd.png +3 -0
- results/new_tests/result_control_pose.png +3 -0
- results/new_tests/result_inpaint.png +3 -0
- results/new_tests/result_t2i.png +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/architecture.webp filter=lfs diff=lfs merge=lfs -text
+assets/decoupled-dmd.webp filter=lfs diff=lfs merge=lfs -text
+assets/DMDR.webp filter=lfs diff=lfs merge=lfs -text
+assets/leaderboard.png filter=lfs diff=lfs merge=lfs -text
+assets/reasoning.png filter=lfs diff=lfs merge=lfs -text
+assets/showcase_editing.png filter=lfs diff=lfs merge=lfs -text
+assets/showcase_realistic.png filter=lfs diff=lfs merge=lfs -text
+assets/showcase_rendering.png filter=lfs diff=lfs merge=lfs -text
+assets/showcase.jpg filter=lfs diff=lfs merge=lfs -text
+assets/Z-Image-Gallery.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+assets/*.png filter=lfs diff=lfs merge=lfs -text
+assets/*.jpg filter=lfs diff=lfs merge=lfs -text
+tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+transformer/z_image_turbo_control_unified_v2.1_q4_k_m.gguf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,20 @@
__pycache__/
*.py[cod]
local_tests/
/.vs
.vscode/
.ruff_cache/
.idea/
models/
venv/
models/
.venv/
*.log
.DS_Store
.gradio
download.py
bk/
outputs/
original/
Makefile
pyproject.toml
README.md
CHANGED
@@ -1,3 +1,174 @@
---
license: apache-2.0
tags:
- text-to-image
- image-to-image
- inpainting
- controlnet
- diffusers
- gguf
- z-image-turbo
pipeline_tag: text-to-image
---

# Z-Image Turbo Control Unified V2 (V2.1)

[](https://github.com/aigc-apps/VideoX-Fun)
[](https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.1)

This repository hosts the **Z-Image Turbo Control Unified V2** model, a specialized architecture that unifies the **Z-Image Turbo** base transformer with enhanced **ControlNet** capabilities in a single, cohesive model. The unified pipeline supports multiple generation modes in one place: **Text-to-Image, Image-to-Image, ControlNet, and Inpainting**.

Unlike traditional pipelines, where ControlNet is an external add-on, this model integrates the control layers directly into the transformer structure. This enables **unified GGUF quantization**: the entire merged architecture (base + control) can be quantized (e.g., Q4_K_M) and run efficiently on consumer hardware with limited VRAM. This version also introduces significant optimizations, architectural improvements, and bug fixes for features such as `group_offload`.

## 📥 Installation

To set up the environment, simply install the dependencies:

```bash
# Create a virtual environment
python -m venv venv

# Activate your venv

# Upgrade pip
python -m pip install --upgrade pip

# Install the requirements
pip install -r requirements.txt
```

*Note: This repository contains a `diffusers_local` folder with the custom `ZImageControlUnifiedPipeline` and transformer logic required to run this specific architecture.*

## 📂 Repository Structure

* `./transformer/z_image_turbo_control_unified_v2.1_q4_k_m.gguf`: The unified, quantized model weights.
* `infer_controlnet.py`: Script for running ControlNet inference.
* `infer_inpaint.py`: Script for running inpainting inference.
* `infer_t2i.py`: Script for running text-to-image inference.
* `infer_i2i.py`: Script for running image-to-image inference.
* `diffusers_local/`: Custom pipeline code (`ZImageControlUnifiedPipeline`) and transformer logic.
* `requirements.txt`: Python dependencies.

## 🚀 Usage

The primary script for inference is `infer_controlnet.py`, which is designed to handle all supported generation modes.

### Option 1: Low VRAM (GGUF) - Recommended
Use this option if you have limited VRAM (e.g., 6 GB - 8 GB). It loads the model from a quantized **GGUF** file (`z_image_turbo_control_unified_v2.1_q4_k_m.gguf`). Simply configure `infer_controlnet.py` to point to the GGUF file.

**Key features of this mode:**
* Loads the unified transformer from a single 4-bit quantized file.
* Enables aggressive `group_offload` to fit large models on consumer GPUs.

### Option 2: High Precision (Diffusers/BF16)
Use this option if you have ample VRAM (e.g., 24 GB+). Configure `infer_controlnet.py` to load the model using the standard `from_pretrained` directory structure for full **BFloat16** precision.

## 🛠️ Model Features & Configuration (V2)

### Original Features
- The ControlNet is attached to 15 transformer layer blocks and 2 refiner layer blocks.
- The model was trained from scratch for 70,000 steps on a dataset of 1 million high-quality images.
- **Multiple control conditions:** supports Canny, HED, Depth, Pose, and MLSD, which can be used like a standard ControlNet.
- You can increase `controlnet_conditioning_scale` for stronger control. For better stability, we highly recommend a detailed prompt; the optimal range for `controlnet_conditioning_scale` is 0.65 to 0.90.
- **Note on steps: as you increase the control strength, it is recommended to increase the number of inference steps accordingly to achieve better results.**

This optimized V2 model introduces several new features and parameters for enhanced control and flexibility:

* **Unified pipeline:** A single pipeline now handles Text-to-Image, Image-to-Image, ControlNet, and Inpainting tasks.
* **Refiner scale (`controlnet_refiner_conditioning_scale`):** Provides fine-grained control over the influence of the refiner layers, so they can be adjusted independently of the main ControlNet conditioning scale.
* **Optional refiner (`add_control_noise_refiner=False`):** The control noise refiner layers can now be disabled when loading the model, either to save memory or for different stylistic results.
* **Inpainting blur (`mask_blur_radius`):** A parameter that softens the edges of the inpainting mask for smoother transitions.
* **Backward compatibility:** The model supports running weights from V1.
* **Group offload fixes:** The underlying code includes fixes that make diffusers' `group_offload` work correctly with `use_stream=True`, enabling efficient memory management without errors.

## 🏞️ V2 Examples: Refiner Scale Test

The new `controlnet_refiner_conditioning_scale` parameter allows fine-tuning of the control signal. Here is a comparison showing its effect while the main control scale is kept fixed.

**Prompt:** "Photorealistic portrait of a beautiful young East Asian woman with long, vibrant purple hair and a black bow. She is wearing a flowing white summer dress, standing on a sunny beach with a sparkling ocean and clear blue sky in the background. Bright natural sunlight, sharp focus, ultra-detailed."
**Control Image:** Pose.

| `controlnet_conditioning_scale=0.75, num_steps=25` | Refiner: Off | Refiner Scale: 0.75 | Refiner Scale: 1.0 | Refiner Scale: 1.5 | Refiner Scale: 2.0 |
|:---:|:---:|:---:|:---:|:---:|:---:|
| **Output** |  |  |  |  |  |

---
### New Tests with this Pipeline

<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
  <tr>
    <td>Pose + Inpaint</td>
    <td>Output</td>
  </tr>
  <tr>
    <td><img src="assets/inpaint.jpg" width="50%" /><img src="assets/mask_inpaint.jpg" width="50%" /></td>
    <td><img src="results/new_tests/result_inpaint.png" width="50%" /></td>
  </tr>
</table>
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
  <tr>
    <td>Pose</td>
    <td>Output</td>
  </tr>
  <tr>
    <td><img src="assets/pose.jpg" width="85%" /></td>
    <td><img src="results/new_tests/result_control_pose.png" width="50%" /></td>
  </tr>
</table>
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
  <tr>
    <td>Canny</td>
    <td>Output</td>
  </tr>
  <tr>
    <td><img src="assets/canny.jpg" width="50%" /></td>
    <td><img src="results/new_tests/result_control_canny.png" width="50%" /></td>
  </tr>
</table>
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
  <tr>
    <td>HED</td>
    <td>Output</td>
  </tr>
  <tr>
    <td><img src="assets/man_hed.png" width="50%" /></td>
    <td><img src="results/new_tests/result_control_hed.png" width="50%" /></td>
  </tr>
</table>
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
  <tr>
    <td>Depth</td>
    <td>Output</td>
  </tr>
  <tr>
    <td><img src="assets/depth_cat.png" width="50%" /></td>
    <td><img src="results/new_tests/result_control_depth.png" width="50%" /></td>
  </tr>
</table>
<table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
  <tr>
    <td>MLSD</td>
    <td>Output</td>
  </tr>
  <tr>
    <td><img src="assets/room_mlsd.png" width="100%" /></td>
    <td><img src="results/new_tests/result_control_mlsd.png" width="50%" /></td>
  </tr>
</table>

## Original V2 Model Results

This section includes examples from the original model for reference. The V2 model is capable of producing these results and more.

### Original Scale Test Results

The table below shows generation results from the original model under different combinations of diffusion steps and control scale strength:

| Diffusion Steps | Scale 0.65 | Scale 0.70 | Scale 0.75 | Scale 0.8 | Scale 0.9 | Scale 1.0 |
|:---------------:|:----------:|:----------:|:----------:|:---------:|:---------:|:---------:|
| **9** |  |  |  |  |  |  |
| **10** |  |  |  |  |  |  |
| **20** |  |  |  |  |  |  |
| **30** |  |  |  |  |  |  |
| **40** |  |  |  |  |  |  |
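For reference, here is a minimal sketch of what the GGUF path (Option 1 in the README above) can look like in code. It is based on the classes shipped in `diffusers_local` and on diffusers' standard GGUF single-file loading; the call-argument name `control_image` and the use of `enable_model_cpu_offload` are assumptions, and `infer_controlnet.py` in this repository remains the authoritative example.

```python
# Hypothetical sketch, not the repository's script: see infer_controlnet.py for the real invocation.
import torch
from diffusers import GGUFQuantizationConfig
from diffusers.utils import load_image

# Importing the patch module registers the custom single-file loader and group_offload fixes.
# Assumes the repository root is on sys.path.
import diffusers_local.patch  # noqa: F401
from diffusers_local import ZImageControlTransformer2DModel
from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline

repo = "."  # local checkout of this repository

# Load the unified (base + control) transformer from the single 4-bit GGUF file.
transformer = ZImageControlTransformer2DModel.from_single_file(
    f"{repo}/transformer/z_image_turbo_control_unified_v2.1_q4_k_m.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)

# Assumes the scheduler / text_encoder / tokenizer / vae folders referenced by model_index.json are present.
pipe = ZImageControlUnifiedPipeline.from_pretrained(repo, transformer=transformer, torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()

image = pipe(
    prompt="Photorealistic portrait of a young woman on a sunny beach, sharp focus, ultra-detailed.",
    control_image=load_image("assets/pose.jpg"),   # argument name assumed
    controlnet_conditioning_scale=0.75,
    controlnet_refiner_conditioning_scale=1.0,
    num_inference_steps=25,
).images[0]
image.save("result_control_pose.png")
```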
assets/DMDR.webp ADDED (Git LFS)
assets/Z-Image-Gallery.pdf ADDED (Git LFS pointer: oid sha256:6f9895b3246d2547bac74bbe0be975da500eaae93f2cad4248ad3281786b1ac6, size 15767436)
assets/architecture.webp ADDED (Git LFS)
assets/bottle.jpg ADDED (Git LFS)
assets/canny.jpg ADDED (Git LFS)
assets/decoupled-dmd.webp ADDED (Git LFS)
assets/depth.jpg ADDED (Git LFS)
assets/depth_cat.png ADDED (Git LFS)
assets/hed.jpg ADDED (Git LFS)
assets/inpaint.jpg ADDED (Git LFS)
assets/leaderboard.png ADDED (Git LFS)
assets/leaderboard.webp ADDED
assets/man_hed.png ADDED (Git LFS)
assets/mask.jpg ADDED (Git LFS)
assets/mask_inpaint.jpg ADDED (Git LFS)
assets/pose.jpg ADDED (Git LFS)
assets/pose2.jpg ADDED (Git LFS)
assets/pose3.jpg ADDED (Git LFS)
assets/pose4.png ADDED (Git LFS)
assets/reasoning.png ADDED (Git LFS)
assets/room_mlsd.png ADDED (Git LFS)
assets/showcase.jpg ADDED (Git LFS)
assets/showcase_editing.png ADDED (Git LFS)
assets/showcase_realistic.png ADDED (Git LFS)
assets/showcase_rendering.png ADDED (Git LFS)
diffusers_local/__init__.py
ADDED
@@ -0,0 +1,7 @@
from .z_image_control_transformer_2d import Transformer2DModelOutput, ZImageControlTransformer2DModel


__all__ = [
    "Transformer2DModelOutput",
    "ZImageControlTransformer2DModel",
]
diffusers_local/patch.py
ADDED
@@ -0,0 +1,509 @@
import importlib
import os
from typing import Optional, Set

import diffusers.loaders.single_file_model as single_file_model
import diffusers.pipelines.pipeline_loading_utils as pipe_loading_utils
import torch
from diffusers.loaders.single_file_utils import (
    convert_animatediff_checkpoint_to_diffusers,
    convert_auraflow_transformer_checkpoint_to_diffusers,
    convert_autoencoder_dc_checkpoint_to_diffusers,
    convert_chroma_transformer_checkpoint_to_diffusers,
    convert_controlnet_checkpoint,
    convert_cosmos_transformer_checkpoint_to_diffusers,
    convert_flux2_transformer_checkpoint_to_diffusers,
    convert_flux_transformer_checkpoint_to_diffusers,
    convert_hidream_transformer_to_diffusers,
    convert_hunyuan_video_transformer_to_diffusers,
    convert_ldm_unet_checkpoint,
    convert_ldm_vae_checkpoint,
    convert_ltx_transformer_checkpoint_to_diffusers,
    convert_ltx_vae_checkpoint_to_diffusers,
    convert_lumina2_to_diffusers,
    convert_mochi_transformer_checkpoint_to_diffusers,
    convert_sana_transformer_to_diffusers,
    convert_sd3_transformer_checkpoint_to_diffusers,
    convert_stable_cascade_unet_single_file_to_diffusers,
    convert_wan_transformer_to_diffusers,
    convert_wan_vae_to_diffusers,
    convert_z_image_transformer_checkpoint_to_diffusers,
    create_controlnet_diffusers_config_from_ldm,
    create_unet_diffusers_config_from_ldm,
    create_vae_diffusers_config_from_ldm,
)
from diffusers.pipelines.pipeline_loading_utils import _unwrap_model
from diffusers.utils import (
    _maybe_remap_transformers_class,
    get_class_from_dynamic_module,
)


try:
    from diffusers.hooks.group_offloading import (
        _GROUP_ID_LAZY_LEAF,
        GroupOffloadingConfig,
        ModelHook,
        ModuleGroup,
        _apply_group_offloading_hook,
        _apply_lazy_group_offloading_hook,
        _find_parent_module_in_module_dict,
        _gather_buffers_with_no_group_offloading_parent,
        _gather_parameters_with_no_group_offloading_parent,
        send_to_device,
    )

except ImportError:
    ModelHook = object
    ModuleGroup = object
    GroupOffloadingConfig = object

    def _apply_group_offloading_hook(*args, **kwargs):
        pass


_MY_GO_LC_SUPPORTED_PYTORCH_LAYERS = (
    torch.nn.Conv1d,
    torch.nn.Conv2d,
    torch.nn.Conv3d,
    torch.nn.ConvTranspose1d,
    torch.nn.ConvTranspose2d,
    torch.nn.ConvTranspose3d,
    torch.nn.Linear,
    torch.nn.Sequential,  # The layer type we want to add support for
)


class GroupOffloadingHook(ModelHook):
    r"""
    A hook that offloads groups of torch.nn.Module to the CPU for storage and onloads to accelerator device for
    computation. Each group has one "onload leader" module that is responsible for onloading, and an "offload leader"
    module that is responsible for offloading. If prefetching is enabled, the onload leader of the previous module
    group is responsible for onloading the current module group.
    """

    _is_stateful = False

    def __init__(self, group: ModuleGroup, *, config: GroupOffloadingConfig) -> None:
        self.group = group
        self.next_group: Optional[ModuleGroup] = None
        self.config = config

    def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
        if self.group.offload_leader == module:
            self.group.offload_()
        return module

    def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
        # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward
        # method is the onload_leader of the group.
        if self.group.onload_leader is None:
            self.group.onload_leader = module

        if self.group.onload_leader == module:
            # STEP 1: GUARANTEE THE CURRENT GROUP'S STATE
            # This section ensures that the parameters for the *current* module are on the correct device
            # before its forward pass is executed.

            # This block handles modules that are part of the prefetching chain (`onload_self` is False).
            # The original design relied on the previous module to initiate the onload, which proved fragile.
            # Our robust fix makes each module responsible for itself:
            # 1. `self.group.onload_()`: Guarantees the data transfer is initiated, acting as a backup if the
            #    previous module in the chain failed to do so.
            # 2. `self.group.stream.synchronize()`: This is the critical synchronization barrier. It forces the
            #    CPU to wait until the asynchronous transfer to the GPU is complete, preventing device mismatch errors.
            if not self.group.onload_self and self.group.stream is not None:
                self.group.onload_()
                self.group.stream.synchronize()

            # This block handles the first module in an execution chain (`onload_self` is True).
            # It is responsible for loading itself onto the device.
            if self.group.onload_self:
                self.group.onload_()
                # If streams are used, the onload() call above is asynchronous. We MUST synchronize here
                # to ensure the module is ready before its computation starts.
                if self.group.stream is not None:
                    self.group.stream.synchronize()

            # At this point, we are 100% certain that the current group's parameters are on the onload_device.

            # STEP 2: INITIATE PREFETCHING FOR THE NEXT GROUP
            # With the current group secured, we can now look ahead and start the asynchronous data transfer
            # for the next module in the execution chain. This allows the data transfer to overlap with the
            # computation of the current module's forward pass, which is the core benefit of prefetching.
            should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
            if should_onload_next_group:
                self.next_group.onload_()

        # The rest of the function handles moving positional (*args) and keyword (**kwargs)
        # arguments to the correct device.
        args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)

        exclude_kwargs = self.config.exclude_kwargs or []
        if exclude_kwargs:
            moved_kwargs = send_to_device(
                {k: v for k, v in kwargs.items() if k not in exclude_kwargs},
                self.group.onload_device,
                non_blocking=self.group.non_blocking,
            )
            kwargs.update(moved_kwargs)
        else:
            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)

        return args, kwargs

    def post_forward(self, module: torch.nn.Module, output):
        if self.group.offload_leader == module:
            self.group.offload_()
        return output


def _apply_group_offloading_leaf_level_patched(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
    """
    Patched version of _apply_group_offloading_leaf_level that supports nn.Sequential.
    """
    modules_with_group_offloading: Set[str] = set()
    for name, submodule in module.named_modules():
        if not isinstance(submodule, _MY_GO_LC_SUPPORTED_PYTORCH_LAYERS):
            continue

        group = ModuleGroup(
            modules=[submodule],
            offload_device=config.offload_device,
            onload_device=config.onload_device,
            offload_to_disk_path=config.offload_to_disk_path,
            offload_leader=submodule,
            onload_leader=submodule,
            non_blocking=config.non_blocking,
            stream=config.stream,
            record_stream=config.record_stream,
            low_cpu_mem_usage=config.low_cpu_mem_usage,
            onload_self=True,
            group_id=name,
        )
        _apply_group_offloading_hook(submodule, group, config=config)
        modules_with_group_offloading.add(name)

    # Parameters and Buffers at all non-leaf levels need to be offloaded/onloaded separately when the forward pass
    # of the module is called
    module_dict = dict(module.named_modules())
    parameters = _gather_parameters_with_no_group_offloading_parent(module, modules_with_group_offloading)
    buffers = _gather_buffers_with_no_group_offloading_parent(module, modules_with_group_offloading)

    # Find closest module parent for each parameter and buffer, and attach group hooks
    parent_to_parameters = {}
    for name, param in parameters:
        parent_name = _find_parent_module_in_module_dict(name, module_dict)
        if parent_name in parent_to_parameters:
            parent_to_parameters[parent_name].append(param)
        else:
            parent_to_parameters[parent_name] = [param]

    parent_to_buffers = {}
    for name, buffer in buffers:
        parent_name = _find_parent_module_in_module_dict(name, module_dict)
        if parent_name in parent_to_buffers:
            parent_to_buffers[parent_name].append(buffer)
        else:
            parent_to_buffers[parent_name] = [buffer]

    parent_names = set(parent_to_parameters.keys()) | set(parent_to_buffers.keys())
    for name in parent_names:
        parameters = parent_to_parameters.get(name, [])
        buffers = parent_to_buffers.get(name, [])
        parent_module = module_dict[name]
        group = ModuleGroup(
            modules=[],
            offload_device=config.offload_device,
            onload_device=config.onload_device,
            offload_leader=parent_module,
            onload_leader=parent_module,
            offload_to_disk_path=config.offload_to_disk_path,
            parameters=parameters,
            buffers=buffers,
            non_blocking=config.non_blocking,
            stream=config.stream,
            record_stream=config.record_stream,
            low_cpu_mem_usage=config.low_cpu_mem_usage,
            onload_self=True,
            group_id=name,
        )
        _apply_group_offloading_hook(parent_module, group, config=config)

    if config.stream is not None:
        # When using streams, we need to know the layer execution order for applying prefetching (to overlap data transfer
        # and computation). Since we don't know the order beforehand, we apply a lazy prefetching hook that will find the
        # execution order and apply prefetching in the correct order.
        unmatched_group = ModuleGroup(
            modules=[],
            offload_device=config.offload_device,
            onload_device=config.onload_device,
            offload_to_disk_path=config.offload_to_disk_path,
            offload_leader=module,
            onload_leader=module,
            parameters=None,
            buffers=None,
            non_blocking=False,
            stream=None,
            record_stream=False,
            low_cpu_mem_usage=config.low_cpu_mem_usage,
            onload_self=True,
            group_id=_GROUP_ID_LAZY_LEAF,
        )
        _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)


try:
    import diffusers.hooks.group_offloading as group_offloading_module

    setattr(group_offloading_module, "_apply_group_offloading_leaf_level", _apply_group_offloading_leaf_level_patched)
    setattr(group_offloading_module, "GroupOffloadingHook", GroupOffloadingHook)
except ImportError as e:
    print(f"-> ERROR: Could not import the `diffusers.hooks.group_offloading` module to apply the patch: {e}")


def convert_z_image_control_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
    Z_IMAGE_KEYS_RENAME_DICT = {
        "final_layer.": "all_final_layer.2-1.",
        "x_embedder.": "all_x_embedder.2-1.",
        ".attention.out.bias": ".attention.to_out.0.bias",
        ".attention.k_norm.weight": ".attention.norm_k.weight",
        ".attention.q_norm.weight": ".attention.norm_q.weight",
        ".attention.out.weight": ".attention.to_out.0.weight",
        "control_x_embedder.": "control_all_x_embedder.2-1.",
    }

    def convert_z_image_fused_attention(key: str, state_dict: dict[str, object]) -> None:
        if ".attention.qkv.weight" not in key:
            return

        fused_qkv_weight = state_dict.pop(key)
        to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
        new_q_name = key.replace(".attention.qkv.weight", ".attention.to_q.weight")
        new_k_name = key.replace(".attention.qkv.weight", ".attention.to_k.weight")
        new_v_name = key.replace(".attention.qkv.weight", ".attention.to_v.weight")

        state_dict[new_q_name] = to_q_weight
        state_dict[new_k_name] = to_k_weight
        state_dict[new_v_name] = to_v_weight
        return

    TRANSFORMER_SPECIAL_KEYS_REMAP = {
        ".attention.qkv.weight": convert_z_image_fused_attention,
    }

    def update_state_dict(state_dict: dict[str, object], old_key: str, new_key: str) -> None:
        state_dict[new_key] = state_dict.pop(old_key)

    converted_state_dict = {key: checkpoint.pop(key) for key in list(checkpoint.keys())}

    # Handle single file --> diffusers key remapping via the remap dict
    for key in list(converted_state_dict.keys()):
        new_key = key[:]
        for replace_key, rename_key in Z_IMAGE_KEYS_RENAME_DICT.items():
            new_key = new_key.replace(replace_key, rename_key)

        update_state_dict(converted_state_dict, key, new_key)

    # Handle any special logic which can't be expressed by a simple 1:1 remapping with the handlers in
    # special_keys_remap
    for key in list(converted_state_dict.keys()):
        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
            if special_key not in key:
                continue
            handler_fn_inplace(key, converted_state_dict)

    return converted_state_dict


SINGLE_FILE_LOADABLE_CLASSES = {
    "StableCascadeUNet": {
        "checkpoint_mapping_fn": convert_stable_cascade_unet_single_file_to_diffusers,
    },
    "UNet2DConditionModel": {
        "checkpoint_mapping_fn": convert_ldm_unet_checkpoint,
        "config_mapping_fn": create_unet_diffusers_config_from_ldm,
        "default_subfolder": "unet",
        "legacy_kwargs": {
            "num_in_channels": "in_channels",  # Legacy kwargs supported by `from_single_file` mapped to new args
        },
    },
    "AutoencoderKL": {
        "checkpoint_mapping_fn": convert_ldm_vae_checkpoint,
        "config_mapping_fn": create_vae_diffusers_config_from_ldm,
        "default_subfolder": "vae",
    },
    "ControlNetModel": {
        "checkpoint_mapping_fn": convert_controlnet_checkpoint,
        "config_mapping_fn": create_controlnet_diffusers_config_from_ldm,
    },
    "SD3Transformer2DModel": {
        "checkpoint_mapping_fn": convert_sd3_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "MotionAdapter": {
        "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
    },
    "SparseControlNetModel": {
        "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
    },
    "FluxTransformer2DModel": {
        "checkpoint_mapping_fn": convert_flux_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "ChromaTransformer2DModel": {
        "checkpoint_mapping_fn": convert_chroma_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "LTXVideoTransformer3DModel": {
        "checkpoint_mapping_fn": convert_ltx_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "AutoencoderKLLTXVideo": {
        "checkpoint_mapping_fn": convert_ltx_vae_checkpoint_to_diffusers,
        "default_subfolder": "vae",
    },
    "AutoencoderDC": {"checkpoint_mapping_fn": convert_autoencoder_dc_checkpoint_to_diffusers},
    "MochiTransformer3DModel": {
        "checkpoint_mapping_fn": convert_mochi_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "HunyuanVideoTransformer3DModel": {
        "checkpoint_mapping_fn": convert_hunyuan_video_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "AuraFlowTransformer2DModel": {
        "checkpoint_mapping_fn": convert_auraflow_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "Lumina2Transformer2DModel": {
        "checkpoint_mapping_fn": convert_lumina2_to_diffusers,
        "default_subfolder": "transformer",
    },
    "SanaTransformer2DModel": {
        "checkpoint_mapping_fn": convert_sana_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "WanTransformer3DModel": {
        "checkpoint_mapping_fn": convert_wan_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "WanVACETransformer3DModel": {
        "checkpoint_mapping_fn": convert_wan_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "AutoencoderKLWan": {
        "checkpoint_mapping_fn": convert_wan_vae_to_diffusers,
        "default_subfolder": "vae",
    },
    "HiDreamImageTransformer2DModel": {
        "checkpoint_mapping_fn": convert_hidream_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "CosmosTransformer3DModel": {
        "checkpoint_mapping_fn": convert_cosmos_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "QwenImageTransformer2DModel": {
        "checkpoint_mapping_fn": lambda x: x,
        "default_subfolder": "transformer",
    },
    "Flux2Transformer2DModel": {
        "checkpoint_mapping_fn": convert_flux2_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "ZImageTransformer2DModel": {
        "checkpoint_mapping_fn": convert_z_image_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "ZImageControlTransformer2DModel": {
        "checkpoint_mapping_fn": convert_z_image_control_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
}


def get_class_obj_and_candidates(library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None):
    """Simple helper method to retrieve class object of module as well as potential parent class objects"""
    component_folder = os.path.join(cache_dir, component_name) if component_name and cache_dir else None

    if is_pipeline_module:
        pipeline_module = getattr(pipelines, library_name)

        class_obj = getattr(pipeline_module, class_name)
        class_candidates = dict.fromkeys(importable_classes.keys(), class_obj)
    elif component_folder and os.path.isfile(os.path.join(component_folder, library_name + ".py")):
        # load custom component
        class_obj = get_class_from_dynamic_module(component_folder, module_file=library_name + ".py", class_name=class_name)
        class_candidates = dict.fromkeys(importable_classes.keys(), class_obj)
    else:
        # else we just import it from the library.
        library = importlib.import_module(library_name)

        # Handle deprecated Transformers classes
        if library_name == "transformers":
            class_name = _maybe_remap_transformers_class(class_name) or class_name

        try:
            class_obj = getattr(library, class_name)
        except Exception:
            module = importlib.import_module("diffusers_local")
            class_obj = getattr(module, class_name)
        class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}

    return class_obj, class_candidates


def _get_single_file_loadable_mapping_class(cls):
    diffusers_module = importlib.import_module("diffusers")
    class_name_str = cls.__name__
    for loadable_class_str in SINGLE_FILE_LOADABLE_CLASSES:
        try:
            loadable_class = getattr(diffusers_module, loadable_class_str)
        except Exception:
            module = importlib.import_module("diffusers_local")
            loadable_class = getattr(module, loadable_class_str)
        if issubclass(cls, loadable_class):
            return loadable_class_str

    return class_name_str


def maybe_raise_or_warn(library_name, library, class_name, importable_classes, passed_class_obj, name, is_pipeline_module):
    """Simple helper method to raise or warn in case incorrect module has been passed"""
    if not is_pipeline_module:
        library = importlib.import_module(library_name)

        # Handle deprecated Transformers classes
        if library_name == "transformers":
            class_name = _maybe_remap_transformers_class(class_name) or class_name

        try:
            class_obj = getattr(library, class_name)
        except Exception:
            module = importlib.import_module("diffusers_local")
            class_obj = getattr(module, class_name)

        class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}

        expected_class_obj = None
        for class_name, class_candidate in class_candidates.items():
            if class_candidate is not None and issubclass(class_obj, class_candidate):
                expected_class_obj = class_candidate

        # Dynamo wraps the original model in a private class.
        # I didn't find a public API to get the original class.
        sub_model = passed_class_obj[name]
        unwrapped_sub_model = _unwrap_model(sub_model)
        model_cls = unwrapped_sub_model.__class__

        if not issubclass(model_cls, expected_class_obj):
            raise ValueError(f"{passed_class_obj[name]} is of type: {model_cls}, but should be {expected_class_obj}")
    else:
        print(f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it has the correct type")


pipe_loading_utils.get_class_obj_and_candidates = get_class_obj_and_candidates
pipe_loading_utils.maybe_raise_or_warn = maybe_raise_or_warn
single_file_model.SINGLE_FILE_LOADABLE_CLASSES = SINGLE_FILE_LOADABLE_CLASSES
single_file_model._get_single_file_loadable_mapping_class = _get_single_file_loadable_mapping_class
diffusers_local/pipeline_z_image_control_unified.py
ADDED
@@ -0,0 +1,910 @@
# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
# Refactored and optimized by DEVAIEXP Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import inspect
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from diffusers import AutoencoderKL, DiffusionPipeline, FlowMatchEulerDiscreteScheduler
from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
from diffusers.loaders import FromSingleFileMixin, ZImageLoraLoaderMixin
from diffusers.pipelines.z_image.pipeline_output import ZImagePipelineOutput
from diffusers.utils import logging
from diffusers.utils.torch_utils import randn_tensor
from PIL import Image, ImageFilter
from transformers import AutoTokenizer, PreTrainedModel

from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel


logger = logging.get_logger(__name__)


def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    """
    Calculates the shift value `mu` for the scheduler based on the image sequence length.

    This function implements a linear interpolation to determine the shift value based on the input
    image's sequence length, scaling between a base and a maximum shift value.

    Args:
        image_seq_len (`int`):
            The sequence length of the image latents (height * width).
        base_seq_len (`int`, *optional*, defaults to 256):
            The base sequence length for the shift calculation.
        max_seq_len (`int`, *optional*, defaults to 4096):
            The maximum sequence length for the shift calculation.
        base_shift (`float`, *optional*, defaults to 0.5):
            The shift value corresponding to `base_seq_len`.
        max_shift (`float`, *optional*, defaults to 1.15):
            The shift value corresponding to `max_seq_len`.

    Returns:
        `float`: The calculated shift value `mu`.
    """
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    mu = image_seq_len * m + b
    return mu


def retrieve_latents(encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"):
    """
    Retrieves latents from a VAE encoder output.

    Args:
        encoder_output (`torch.Tensor`):
            The output of a VAE encoder.
        generator (`torch.Generator`, *optional*):
            A random number generator for sampling from the latent distribution.
        sample_mode (`str`, *optional*, defaults to "sample"):
            The method to retrieve latents. Can be "sample" to sample from the distribution or
            "argmax" to take the mode.

    Returns:
        `torch.Tensor`: The retrieved latents.
    """
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


class ZImageControlUnifiedPipeline(DiffusionPipeline, ZImageLoraLoaderMixin, FromSingleFileMixin):
    model_cpu_offload_seq = "text_encoder->vae->transformer"
    _optional_components = []
    _callback_tensor_inputs = ["latents", "prompt_embeds"]

    def __init__(
        self,
        scheduler: FlowMatchEulerDiscreteScheduler,
        vae: AutoencoderKL,
        text_encoder: PreTrainedModel,
        tokenizer: AutoTokenizer,
        transformer: ZImageControlTransformer2DModel,
    ):
        """
        Initializes the ZImageControlUnifiedPipeline.

        Args:
            scheduler (`FlowMatchEulerDiscreteScheduler`):
                A scheduler to be used in combination with `transformer` to denoise the latents.
            vae (`AutoencoderKL`):
                Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
            text_encoder (`PreTrainedModel`):
                A pretrained text encoder model.
            tokenizer (`AutoTokenizer`):
                A tokenizer to prepare text prompts for the `text_encoder`.
            transformer (`ZImageControlTransformer2DModel`):
                The main transformer model for the diffusion process.
        """
        super().__init__()
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            scheduler=scheduler,
            transformer=transformer,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
        self.mask_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True)

    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_sequence_length: int = 512,
+
|
| 63 |
+
Returns:
|
| 64 |
+
`float`: The calculated shift value `mu`.
|
| 65 |
+
"""
|
| 66 |
+
m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
|
| 67 |
+
b = base_shift - m * base_seq_len
|
| 68 |
+
mu = image_seq_len * m + b
|
| 69 |
+
return mu
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def retrieve_latents(encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"):
|
| 73 |
+
"""
|
| 74 |
+
Retrieves latents from a VAE encoder output.
|
| 75 |
+
|
| 76 |
+
Args:
|
| 77 |
+
encoder_output (`torch.Tensor`):
|
| 78 |
+
The output of a VAE encoder.
|
| 79 |
+
generator (`torch.Generator`, *optional*):
|
| 80 |
+
A random number generator for sampling from the latent distribution.
|
| 81 |
+
sample_mode (`str`, *optional*, defaults to "sample"):
|
| 82 |
+
The method to retrieve latents. Can be "sample" to sample from the distribution or
|
| 83 |
+
"argmax" to take the mode.
|
| 84 |
+
|
| 85 |
+
Returns:
|
| 86 |
+
`torch.Tensor`: The retrieved latents.
|
| 87 |
+
"""
|
| 88 |
+
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
|
| 89 |
+
return encoder_output.latent_dist.sample(generator)
|
| 90 |
+
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
|
| 91 |
+
return encoder_output.latent_dist.mode()
|
| 92 |
+
elif hasattr(encoder_output, "latents"):
|
| 93 |
+
return encoder_output.latents
|
| 94 |
+
else:
|
| 95 |
+
raise AttributeError("Could not access latents of provided encoder_output")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def retrieve_timesteps(
|
| 99 |
+
scheduler,
|
| 100 |
+
num_inference_steps: Optional[int] = None,
|
| 101 |
+
device: Optional[Union[str, torch.device]] = None,
|
| 102 |
+
timesteps: Optional[List[int]] = None,
|
| 103 |
+
sigmas: Optional[List[float]] = None,
|
| 104 |
+
**kwargs,
|
| 105 |
+
):
|
| 106 |
+
"""
|
| 107 |
+
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
| 108 |
+
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
| 109 |
+
|
| 110 |
+
Args:
|
| 111 |
+
scheduler (`SchedulerMixin`):
|
| 112 |
+
The scheduler to get timesteps from.
|
| 113 |
+
num_inference_steps (`int`):
|
| 114 |
+
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
| 115 |
+
must be `None`.
|
| 116 |
+
device (`str` or `torch.device`, *optional*):
|
| 117 |
+
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
| 118 |
+
timesteps (`List[int]`, *optional*):
|
| 119 |
+
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
|
| 120 |
+
`num_inference_steps` and `sigmas` must be `None`.
|
| 121 |
+
sigmas (`List[float]`, *optional*):
|
| 122 |
+
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
| 123 |
+
`num_inference_steps` and `timesteps` must be `None`.
|
| 124 |
+
|
| 125 |
+
Returns:
|
| 126 |
+
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
| 127 |
+
second element is the number of inference steps.
|
| 128 |
+
"""
|
| 129 |
+
if timesteps is not None and sigmas is not None:
|
| 130 |
+
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
|
| 131 |
+
if timesteps is not None:
|
| 132 |
+
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
| 133 |
+
if not accepts_timesteps:
|
| 134 |
+
raise ValueError(
|
| 135 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
| 136 |
+
f" timestep schedules. Please check whether you are using the correct scheduler."
|
| 137 |
+
)
|
| 138 |
+
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
|
| 139 |
+
timesteps = scheduler.timesteps
|
| 140 |
+
num_inference_steps = len(timesteps)
|
| 141 |
+
elif sigmas is not None:
|
| 142 |
+
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
| 143 |
+
if not accept_sigmas:
|
| 144 |
+
raise ValueError(
|
| 145 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
| 146 |
+
f" sigmas schedules. Please check whether you are using the correct scheduler."
|
| 147 |
+
)
|
| 148 |
+
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
|
| 149 |
+
timesteps = scheduler.timesteps
|
| 150 |
+
num_inference_steps = len(timesteps)
|
| 151 |
+
else:
|
| 152 |
+
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
|
| 153 |
+
timesteps = scheduler.timesteps
|
| 154 |
+
return timesteps, num_inference_steps
|
| 155 |
+
|
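# Usage sketch (illustrative): callers can either let the scheduler space the steps or
# override the schedule, mirroring how the pipeline below passes `sigmas` and `mu`:
#   timesteps, n = retrieve_timesteps(scheduler, num_inference_steps=20, device="cuda", mu=0.63)
#   timesteps, n = retrieve_timesteps(scheduler, sigmas=[1.0, 0.75, 0.5, 0.25, 0.0], device="cuda")
# Passing both `timesteps` and `sigmas` raises a ValueError, as implemented above.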
| 156 |
+
|
| 157 |
+
class ZImageControlUnifiedPipeline(DiffusionPipeline, ZImageLoraLoaderMixin, FromSingleFileMixin):
|
| 158 |
+
model_cpu_offload_seq = "text_encoder->vae->transformer"
|
| 159 |
+
_optional_components = []
|
| 160 |
+
_callback_tensor_inputs = ["latents", "prompt_embeds"]
|
| 161 |
+
|
| 162 |
+
def __init__(
|
| 163 |
+
self,
|
| 164 |
+
scheduler: FlowMatchEulerDiscreteScheduler,
|
| 165 |
+
vae: AutoencoderKL,
|
| 166 |
+
text_encoder: PreTrainedModel,
|
| 167 |
+
tokenizer: AutoTokenizer,
|
| 168 |
+
transformer: ZImageControlTransformer2DModel,
|
| 169 |
+
):
|
| 170 |
+
"""
|
| 171 |
+
Initializes the ZImageControlUnifiedPipeline.
|
| 172 |
+
|
| 173 |
+
Args:
|
| 174 |
+
scheduler (`FlowMatchEulerDiscreteScheduler`):
|
| 175 |
+
A scheduler to be used in combination with `transformer` to denoise the latents.
|
| 176 |
+
vae (`AutoencoderKL`):
|
| 177 |
+
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
| 178 |
+
text_encoder (`PreTrainedModel`):
|
| 179 |
+
A pretrained text encoder model.
|
| 180 |
+
tokenizer (`AutoTokenizer`):
|
| 181 |
+
A tokenizer to prepare text prompts for the `text_encoder`.
|
| 182 |
+
transformer (`ZImageControlTransformer2DModel`):
|
| 183 |
+
The main transformer model for the diffusion process.
|
| 184 |
+
"""
|
| 185 |
+
super().__init__()
|
| 186 |
+
self.register_modules(
|
| 187 |
+
vae=vae,
|
| 188 |
+
text_encoder=text_encoder,
|
| 189 |
+
tokenizer=tokenizer,
|
| 190 |
+
scheduler=scheduler,
|
| 191 |
+
transformer=transformer,
|
| 192 |
+
)
|
| 193 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
|
| 194 |
+
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
|
| 195 |
+
self.mask_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True)
|
| 196 |
+
|
| 197 |
+
def encode_prompt(
|
| 198 |
+
self,
|
| 199 |
+
prompt: Union[str, List[str]],
|
| 200 |
+
device: Optional[torch.device] = None,
|
| 201 |
+
num_images_per_prompt: int = 1,
|
| 202 |
+
do_classifier_free_guidance: bool = True,
|
| 203 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
| 204 |
+
prompt_embeds: Optional[List[torch.FloatTensor]] = None,
|
| 205 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
| 206 |
+
max_sequence_length: int = 512,
|
| 207 |
+
):
|
| 208 |
+
"""
|
| 209 |
+
Encodes the prompt into text embeddings.
|
| 210 |
+
|
| 211 |
+
Args:
|
| 212 |
+
prompt (`Union[str, List[str]]`):
|
| 213 |
+
The prompt or prompts to guide the image generation.
|
| 214 |
+
device (`Optional[torch.device]`):
|
| 215 |
+
The device to move the embeddings to.
|
| 216 |
+
num_images_per_prompt (`int`):
|
| 217 |
+
The number of images to generate per prompt.
|
| 218 |
+
do_classifier_free_guidance (`bool`):
|
| 219 |
+
Whether to generate embeddings for classifier-free guidance.
|
| 220 |
+
negative_prompt (`Optional[Union[str, List[str]]]`):
|
| 221 |
+
The negative prompt or prompts.
|
| 222 |
+
prompt_embeds (`Optional[List[torch.FloatTensor]]`):
|
| 223 |
+
Pre-generated positive prompt embeddings.
|
| 224 |
+
negative_prompt_embeds (`Optional[torch.FloatTensor]`):
|
| 225 |
+
Pre-generated negative prompt embeddings.
|
| 226 |
+
max_sequence_length (`int`):
|
| 227 |
+
The maximum sequence length for tokenization.
|
| 228 |
+
|
| 229 |
+
Returns:
|
| 230 |
+
`Tuple[List[torch.Tensor], List[torch.Tensor]]`: A tuple containing the positive and negative prompt embeddings.
|
| 231 |
+
"""
|
| 232 |
+
device = device or self._execution_device
|
| 233 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
| 234 |
+
|
| 235 |
+
if prompt_embeds is not None:
|
| 236 |
+
pass
|
| 237 |
+
else:
|
| 238 |
+
prompt_embeds = self._encode_prompt(
|
| 239 |
+
prompt=prompt,
|
| 240 |
+
device=device,
|
| 241 |
+
max_sequence_length=max_sequence_length,
|
| 242 |
+
)
|
| 243 |
+
if num_images_per_prompt > 1:
|
| 244 |
+
prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)]
|
| 245 |
+
|
| 246 |
+
if do_classifier_free_guidance:
|
| 247 |
+
if negative_prompt_embeds is not None:
|
| 248 |
+
pass
|
| 249 |
+
else:
|
| 250 |
+
if negative_prompt is None:
|
| 251 |
+
negative_prompt = [""] * len(prompt)
|
| 252 |
+
else:
|
| 253 |
+
negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
|
| 254 |
+
assert len(prompt) == len(negative_prompt)
|
| 255 |
+
negative_prompt_embeds = self._encode_prompt(
|
| 256 |
+
prompt=negative_prompt,
|
| 257 |
+
device=device,
|
| 258 |
+
max_sequence_length=max_sequence_length,
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
+
if num_images_per_prompt > 1:
|
| 262 |
+
negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)]
|
| 263 |
+
|
| 264 |
+
return prompt_embeds, negative_prompt_embeds
|
| 265 |
+
|
| 266 |
+
def _encode_prompt(self, prompt: Union[str, List[str]], device: torch.device, max_sequence_length: int) -> List[torch.Tensor]:
|
| 267 |
+
"""
|
| 268 |
+
Internal helper to encode a list of prompts into embeddings, applying chat templates if available.
|
| 269 |
+
|
| 270 |
+
Args:
|
| 271 |
+
prompt (`Union[str, List[str]]`):
|
| 272 |
+
A list of strings to be encoded.
|
| 273 |
+
device (`torch.device`):
|
| 274 |
+
The target device for the embeddings.
|
| 275 |
+
max_sequence_length (`int`):
|
| 276 |
+
The maximum length for tokenization.
|
| 277 |
+
|
| 278 |
+
Returns:
|
| 279 |
+
`List[torch.Tensor]`: A list of embedding tensors, one for each prompt.
|
| 280 |
+
"""
|
| 281 |
+
formatted_prompts = []
|
| 282 |
+
for p in prompt:
|
| 283 |
+
messages = [{"role": "user", "content": p}]
|
| 284 |
+
if hasattr(self.tokenizer, "apply_chat_template"):
|
| 285 |
+
formatted_prompts.append(self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True))
|
| 286 |
+
else:
|
| 287 |
+
formatted_prompts.append(p)
|
| 288 |
+
|
| 289 |
+
text_inputs = self.tokenizer(
|
| 290 |
+
formatted_prompts,
|
| 291 |
+
padding="max_length",
|
| 292 |
+
max_length=max_sequence_length,
|
| 293 |
+
truncation=True,
|
| 294 |
+
return_tensors="pt",
|
| 295 |
+
).to(device)
|
| 296 |
+
|
| 297 |
+
prompt_masks = text_inputs.attention_mask.bool()
|
| 298 |
+
|
| 299 |
+
with torch.no_grad():
|
| 300 |
+
prompt_embeds_batch = self.text_encoder(input_ids=text_inputs.input_ids, attention_mask=prompt_masks, output_hidden_states=True).hidden_states[-2]
|
| 301 |
+
|
| 302 |
+
embeddings_list = []
|
| 303 |
+
for i in range(prompt_embeds_batch.shape[0]):
|
| 304 |
+
embeddings_list.append(prompt_embeds_batch[i][prompt_masks[i]])
|
| 305 |
+
|
| 306 |
+
return embeddings_list
|
| 307 |
+
|
| 308 |
+
def get_timesteps(self, num_inference_steps, strength, device):
|
| 309 |
+
"""
|
| 310 |
+
Calculates the timesteps for the scheduler based on the number of inference steps and strength.
|
| 311 |
+
This is primarily used for image-to-image pipelines.
|
| 312 |
+
|
| 313 |
+
Args:
|
| 314 |
+
num_inference_steps (`int`): The total number of diffusion steps.
|
| 315 |
+
strength (`float`): The strength of the denoising process. A value of 1.0 means full denoising.
|
| 316 |
+
device (`torch.device`): The device to place the timesteps on.
|
| 317 |
+
|
| 318 |
+
Returns:
|
| 319 |
+
`Tuple[torch.Tensor, int]`: A tuple containing the timesteps and the number of steps to run.
|
| 320 |
+
"""
|
| 321 |
+
init_timestep = min(num_inference_steps * strength, num_inference_steps)
|
| 322 |
+
|
| 323 |
+
t_start = int(max(num_inference_steps - init_timestep, 0))
|
| 324 |
+
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
| 325 |
+
if hasattr(self.scheduler, "set_begin_index"):
|
| 326 |
+
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
| 327 |
+
|
| 328 |
+
return timesteps, num_inference_steps - t_start
|
| 329 |
+
|
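# Worked example (illustrative): with num_inference_steps=20 and strength=0.6,
# init_timestep = min(20 * 0.6, 20) = 12 and t_start = 8, so only the last 12 scheduled
# timesteps are run -- lower strength keeps more of the input image.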
| 330 |
+
def prepare_latents(
|
| 331 |
+
self,
|
| 332 |
+
batch_size: int,
|
| 333 |
+
num_channels_latents: int,
|
| 334 |
+
height: int,
|
| 335 |
+
width: int,
|
| 336 |
+
dtype: torch.dtype,
|
| 337 |
+
device: torch.device,
|
| 338 |
+
generator: torch.Generator,
|
| 339 |
+
image: Optional[PipelineImageInput] = None,
|
| 340 |
+
timestep: Optional[torch.Tensor] = None,
|
| 341 |
+
latents: Optional[torch.Tensor] = None,
|
| 342 |
+
):
|
| 343 |
+
"""
|
| 344 |
+
Prepares the initial latents for the diffusion process.
|
| 345 |
+
|
| 346 |
+
This function handles three cases:
|
| 347 |
+
1. `latents` are provided: They are returned directly.
|
| 348 |
+
2. `image` is None (Text-to-Image): Random noise is generated.
|
| 349 |
+
3. `image` is provided (Image-to-Image): The image is encoded, and noise is added according to the timestep.
|
| 350 |
+
|
| 351 |
+
Args:
|
| 352 |
+
batch_size (`int`): The number of latents to generate.
|
| 353 |
+
num_channels_latents (`int`): The number of channels in the latents.
|
| 354 |
+
height (`int`): The height of the output image in pixels.
|
| 355 |
+
width (`int`): The width of the output image in pixels.
|
| 356 |
+
dtype (`torch.dtype`): The data type for the latents.
|
| 357 |
+
device (`torch.device`): The device to create the latents on.
|
| 358 |
+
generator (`torch.Generator`): A random generator for creating the initial noise.
|
| 359 |
+
image (`Optional[PipelineImageInput]`): An initial image for img2img mode.
|
| 360 |
+
timestep (`Optional[torch.Tensor]`): The starting timestep for adding noise in img2img mode.
|
| 361 |
+
latents (`Optional[torch.Tensor]`): Pre-generated latents.
|
| 362 |
+
|
| 363 |
+
Returns:
|
| 364 |
+
`torch.Tensor`: The prepared latents.
|
| 365 |
+
"""
|
| 366 |
+
latent_height = 2 * (int(height) // (self.vae_scale_factor * 2))
|
| 367 |
+
latent_width = 2 * (int(width) // (self.vae_scale_factor * 2))
|
| 368 |
+
shape = (batch_size, num_channels_latents, latent_height, latent_width)
|
| 369 |
+
|
| 370 |
+
if latents is not None:
|
| 371 |
+
return latents.to(device=device, dtype=dtype)
|
| 372 |
+
|
| 373 |
+
if image is None:
|
| 374 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
| 375 |
+
return latents
|
| 376 |
+
|
| 377 |
+
image_tensor = self.image_processor.preprocess(image, height=height, width=width).to(device=device, dtype=self.vae.dtype)
|
| 378 |
+
with torch.no_grad():
|
| 379 |
+
if image_tensor.shape[1] != num_channels_latents:
|
| 380 |
+
if isinstance(generator, list):
|
| 381 |
+
image_latents = [retrieve_latents(self.vae.encode(image_tensor[i : i + 1]), generator=generator[i]) for i in range(image_tensor.shape[0])]
|
| 382 |
+
image_latents = torch.cat(image_latents, dim=0)
|
| 383 |
+
else:
|
| 384 |
+
image_latents = retrieve_latents(self.vae.encode(image_tensor), generator=generator)
|
| 385 |
+
|
| 386 |
+
image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
|
| 387 |
+
image_latents = image_latents.to(dtype)
|
| 388 |
+
|
| 389 |
+
if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
|
| 390 |
+
additional_image_per_prompt = batch_size // image_latents.shape[0]
|
| 391 |
+
image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
|
| 392 |
+
elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
|
| 393 |
+
raise ValueError(f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts.")
|
| 394 |
+
|
| 395 |
+
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
| 396 |
+
latents = self.scheduler.scale_noise(image_latents, timestep, noise)
|
| 397 |
+
|
| 398 |
+
return latents
|
| 399 |
+
|
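# Shape note (illustrative): with the usual vae_scale_factor of 8 and a 1024x1024 target,
# latent_height = latent_width = 2 * (1024 // 16) = 128, so text-to-image noise is sampled
# with shape (batch_size, num_channels_latents, 128, 128) before entering the denoising loop.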
| 400 |
+
def _prepare_image_latents(
|
| 401 |
+
self,
|
| 402 |
+
image: PipelineImageInput,
|
| 403 |
+
mask_image: PipelineImageInput,
|
| 404 |
+
width: int,
|
| 405 |
+
height: int,
|
| 406 |
+
batch_size: int,
|
| 407 |
+
num_images_per_prompt: int,
|
| 408 |
+
device: torch.device,
|
| 409 |
+
dtype: torch.dtype,
|
| 410 |
+
do_preprocess: bool = True,
|
| 411 |
+
) -> torch.Tensor:
|
| 412 |
+
"""
|
| 413 |
+
Generic function to encode an image into 5D latents for inpainting context.
|
| 414 |
+
|
| 415 |
+
If `do_preprocess` is True, it processes the image (PIL/np).
|
| 416 |
+
If `do_preprocess` is False, it assumes 'image' is already a ready-to-use tensor.
|
| 417 |
+
|
| 418 |
+
Args:
|
| 419 |
+
image (`PipelineImageInput`): The input image. Can be None to return zeros.
mask_image (`PipelineImageInput`): An optional inpainting mask; regions where the mask is >= 0.5 are zeroed out of the image before encoding.
|
| 420 |
+
width (`int`): The target width.
|
| 421 |
+
height (`int`): The target height.
|
| 422 |
+
batch_size (`int`): The prompt batch size.
|
| 423 |
+
num_images_per_prompt (`int`): The number of images per prompt.
|
| 424 |
+
device (`torch.device`): The target device.
|
| 425 |
+
dtype (`torch.dtype`): The target data type.
|
| 426 |
+
do_preprocess (`bool`): Whether to preprocess the image.
|
| 427 |
+
|
| 428 |
+
Returns:
|
| 429 |
+
`torch.Tensor`: A 5D tensor of the encoded image latents.
|
| 430 |
+
"""
|
| 431 |
+
if image is None:
|
| 432 |
+
latent_h = height // self.vae_scale_factor
|
| 433 |
+
latent_w = width // self.vae_scale_factor
|
| 434 |
+
shape = (batch_size * num_images_per_prompt, self.transformer.in_channels, 1, latent_h, latent_w)
|
| 435 |
+
return torch.zeros(shape, device=device, dtype=dtype)
|
| 436 |
+
|
| 437 |
+
if do_preprocess:
|
| 438 |
+
image_tensor = self.image_processor.preprocess(image, height=height, width=width).to(device=device, dtype=self.vae.dtype)
|
| 439 |
+
else:
|
| 440 |
+
image_tensor = image.to(device=device, dtype=self.vae.dtype)
|
| 441 |
+
|
| 442 |
+
if mask_image is not None:
|
| 443 |
+
mask_condition = self.mask_processor.preprocess(mask_image, height=height, width=width).to(device=device, dtype=self.vae.dtype)
|
| 444 |
+
# Tile the single-channel mask to 3 channels (RGB)
|
| 445 |
+
mask_condition = torch.tile(mask_condition, [1, 3, 1, 1])
|
| 446 |
+
# Apply the mask: keep only dark areas (mask < 0.5); masked (white) areas are zeroed out
|
| 447 |
+
image_tensor = image_tensor * (mask_condition < 0.5)
|
| 448 |
+
|
| 449 |
+
with torch.no_grad():
|
| 450 |
+
latents = retrieve_latents(self.vae.encode(image_tensor), sample_mode="argmax")
|
| 451 |
+
latents = (latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
|
| 452 |
+
|
| 453 |
+
effective_batch_size = batch_size * num_images_per_prompt
|
| 454 |
+
if latents.shape[0] != effective_batch_size:
|
| 455 |
+
repeat_by = effective_batch_size // latents.shape[0]
|
| 456 |
+
latents = latents.repeat_interleave(repeat_by, dim=0)
|
| 457 |
+
|
| 458 |
+
return latents.to(dtype=dtype).unsqueeze(2)
|
| 459 |
+
|
| 460 |
+
def _prepare_mask_latents(
|
| 461 |
+
self,
|
| 462 |
+
mask_image: PipelineImageInput,
|
| 463 |
+
width: int,
|
| 464 |
+
height: int,
|
| 465 |
+
batch_size: int,
|
| 466 |
+
num_images_per_prompt: int,
|
| 467 |
+
reference_latents_shape: Tuple,
|
| 468 |
+
device: torch.device,
|
| 469 |
+
dtype: torch.dtype,
|
| 470 |
+
) -> torch.Tensor:
|
| 471 |
+
"""
|
| 472 |
+
Processes a mask image with the mask_processor, inverts it, resizes it to the latent resolution, and formats it for the control_context.
|
| 473 |
+
|
| 474 |
+
Args:
|
| 475 |
+
mask_image (`PipelineImageInput`): The mask image. Can be None to return zeros.
|
| 476 |
+
width (`int`): The target width.
|
| 477 |
+
height (`int`): The target height.
|
| 478 |
+
batch_size (`int`): The prompt batch size.
|
| 479 |
+
num_images_per_prompt (`int`): The number of images per prompt.
|
| 480 |
+
reference_latents_shape (`Tuple`): The shape of the inpainting latents for resizing.
|
| 481 |
+
device (`torch.device`): The target device.
|
| 482 |
+
dtype (`torch.dtype`): The target data type.
|
| 483 |
+
|
| 484 |
+
Returns:
|
| 485 |
+
`torch.Tensor`: A 5D tensor of the processed mask latents.
|
| 486 |
+
"""
|
| 487 |
+
if mask_image is None:
|
| 488 |
+
placeholder_shape = (
|
| 489 |
+
batch_size * num_images_per_prompt,
|
| 490 |
+
1,
|
| 491 |
+
1,
|
| 492 |
+
reference_latents_shape[-2],
|
| 493 |
+
reference_latents_shape[-1],
|
| 494 |
+
)
|
| 495 |
+
return torch.zeros(placeholder_shape, device=device, dtype=dtype)
|
| 496 |
+
|
| 497 |
+
mask_condition = self.mask_processor.preprocess(mask_image, height=height, width=width).to(device=device, dtype=dtype)
|
| 498 |
+
|
| 499 |
+
mask_for_inpainting = 1.0 - mask_condition
|
| 500 |
+
|
| 501 |
+
mask_latents = F.interpolate(mask_for_inpainting, size=reference_latents_shape[-2:], mode="nearest")
|
| 502 |
+
|
| 503 |
+
return mask_latents.unsqueeze(2)
|
| 504 |
+
|
| 505 |
+
def prepare_control_latents(
|
| 506 |
+
self, image: PipelineImageInput, width: int, height: int, batch_size: int, num_images_per_prompt: int, device: torch.device, dtype: torch.dtype
|
| 507 |
+
) -> torch.Tensor:
|
| 508 |
+
"""
|
| 509 |
+
Preprocesses a control image, encodes it with the VAE into latent space,
|
| 510 |
+
and returns a 5D tensor ready for the transformer model.
|
| 511 |
+
|
| 512 |
+
Args:
|
| 513 |
+
image (`PipelineImageInput`): The control image. Can be None to return zeros.
|
| 514 |
+
width (`int`): The target width.
|
| 515 |
+
height (`int`): The target height.
|
| 516 |
+
batch_size (`int`): The prompt batch size.
|
| 517 |
+
num_images_per_prompt (`int`): The number of images per prompt.
|
| 518 |
+
device (`torch.device`): The target device.
|
| 519 |
+
dtype (`torch.dtype`): The target data type.
|
| 520 |
+
|
| 521 |
+
Returns:
|
| 522 |
+
`torch.Tensor`: A 5D tensor of the control image latents.
|
| 523 |
+
"""
|
| 524 |
+
if image is None:
|
| 525 |
+
latent_h = 2 * (int(height) // (self.vae_scale_factor * 2))
|
| 526 |
+
latent_w = 2 * (int(width) // (self.vae_scale_factor * 2))
|
| 527 |
+
return torch.zeros(
|
| 528 |
+
(batch_size * num_images_per_prompt, self.transformer.in_channels, 1, latent_h, latent_w),
|
| 529 |
+
device=device,
|
| 530 |
+
dtype=dtype,
|
| 531 |
+
)
|
| 532 |
+
|
| 533 |
+
image_tensor = self.image_processor.preprocess(image, height=height, width=width).to(device=device, dtype=self.vae.dtype)
|
| 534 |
+
with torch.no_grad():
|
| 535 |
+
latents = retrieve_latents(self.vae.encode(image_tensor), sample_mode="argmax")
|
| 536 |
+
latents = (latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
|
| 537 |
+
|
| 538 |
+
effective_batch_size = batch_size * num_images_per_prompt
|
| 539 |
+
if latents.shape[0] < effective_batch_size:
|
| 540 |
+
latents = latents.repeat_interleave(effective_batch_size // latents.shape[0], dim=0)
|
| 541 |
+
|
| 542 |
+
return latents.to(dtype=dtype).unsqueeze(2)
|
| 543 |
+
|
| 544 |
+
def _apply_mask_blur(self, mask_image, mask_blur_radius, is_inpaint_mode):
|
| 545 |
+
"""
|
| 546 |
+
Apply Gaussian blur to a mask image for inpainting operations.
|
| 547 |
+
Args:
|
| 548 |
+
mask_image (Image.Image | np.ndarray | torch.Tensor): The mask image to be blurred.
|
| 549 |
+
Can be provided as a PIL Image, NumPy array, or PyTorch tensor.
|
| 550 |
+
mask_blur_radius (float): The radius of the Gaussian blur filter in pixels.
|
| 551 |
+
Only applied if is_inpaint_mode is True and mask_blur_radius > 0.
|
| 552 |
+
is_inpaint_mode (bool): Flag indicating whether the pipeline is in inpainting mode.
|
| 553 |
+
Blur is only applied when this is True.
|
| 554 |
+
Returns:
|
| 555 |
+
Image.Image | np.ndarray | torch.Tensor: The mask image with Gaussian blur applied
|
| 556 |
+
if is_inpaint_mode is True and mask_blur_radius > 0. Otherwise, returns the
|
| 557 |
+
original mask_image unchanged. When blur is applied, the result is returned as a `PIL.Image.Image` regardless of the input type.
|
| 558 |
+
"""
|
| 559 |
+
mask_to_use = mask_image
|
| 560 |
+
if is_inpaint_mode and mask_blur_radius > 0:
|
| 561 |
+
if isinstance(mask_image, Image.Image):
|
| 562 |
+
mask_pil = mask_image
|
| 563 |
+
elif isinstance(mask_image, np.ndarray):
|
| 564 |
+
mask_pil = Image.fromarray(mask_image)
|
| 565 |
+
elif isinstance(mask_image, torch.Tensor):
|
| 566 |
+
mask_pil = Image.fromarray(mask_image.cpu().numpy().astype(np.uint8))
|
| 567 |
+
else:
|
| 568 |
+
mask_pil = mask_image
|
| 569 |
+
|
| 570 |
+
mask_to_use = mask_pil.filter(ImageFilter.GaussianBlur(radius=mask_blur_radius))
|
| 571 |
+
return mask_to_use
|
| 572 |
+
|
| 573 |
+
@property
|
| 574 |
+
def guidance_scale(self):
|
| 575 |
+
return self._guidance_scale
|
| 576 |
+
|
| 577 |
+
@property
|
| 578 |
+
def do_classifier_free_guidance(self):
|
| 579 |
+
return self._guidance_scale > 1
|
| 580 |
+
|
| 581 |
+
@property
|
| 582 |
+
def joint_attention_kwargs(self):
|
| 583 |
+
return self._joint_attention_kwargs
|
| 584 |
+
|
| 585 |
+
@property
|
| 586 |
+
def num_timesteps(self):
|
| 587 |
+
return self._num_timesteps
|
| 588 |
+
|
| 589 |
+
@property
|
| 590 |
+
def interrupt(self):
|
| 591 |
+
return self._interrupt
|
| 592 |
+
|
| 593 |
+
def __call__(
|
| 594 |
+
self,
|
| 595 |
+
prompt: Union[str, List[str]],
|
| 596 |
+
image: Optional[PipelineImageInput] = None,
|
| 597 |
+
mask_image: Optional[PipelineImageInput] = None,
|
| 598 |
+
mask_blur_radius: float = 4.0,
|
| 599 |
+
control_image: Optional[PipelineImageInput] = None,
|
| 600 |
+
height: Optional[int] = None,
|
| 601 |
+
width: Optional[int] = None,
|
| 602 |
+
num_inference_steps: int = 20,
|
| 603 |
+
sigmas: Optional[List[float]] = None,
|
| 604 |
+
strength: float = 1.0,
|
| 605 |
+
guidance_scale: float = 4.0,
|
| 606 |
+
cfg_normalization: bool = False,
|
| 607 |
+
cfg_truncation: float = 1.0,
|
| 608 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
| 609 |
+
num_images_per_prompt: int = 1,
|
| 610 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
| 611 |
+
latents: Optional[torch.Tensor] = None,
|
| 612 |
+
prompt_embeds: Optional[List[torch.FloatTensor]] = None,
|
| 613 |
+
negative_prompt_embeds: Optional[List[torch.FloatTensor]] = None,
|
| 614 |
+
controlnet_conditioning_scale: float = 1.0,
|
| 615 |
+
controlnet_refiner_conditioning_scale: float = 1.0,
|
| 616 |
+
output_type: str = "pil",
|
| 617 |
+
return_dict: bool = True,
|
| 618 |
+
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
| 619 |
+
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
| 620 |
+
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
| 621 |
+
max_sequence_length: int = 512,
|
| 622 |
+
):
|
| 623 |
+
r"""
|
| 624 |
+
The main entry point for the Z-Image unified pipeline for generation.
|
| 625 |
+
|
| 626 |
+
Args:
|
| 627 |
+
prompt (`str` or `List[str]`, *optional*):
|
| 628 |
+
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
|
| 629 |
+
image (`PipelineImageInput`, *optional*):
|
| 630 |
+
The initial image for image-to-image or inpainting modes.
|
| 631 |
+
mask_image (`PipelineImageInput`, *optional*):
|
| 632 |
+
The mask image for inpainting. White areas are inpainted (regenerated), black areas are preserved.
|
| 633 |
+
mask_blur_radius (`float`, *optional*, defaults to 4.0):
|
| 634 |
+
The radius for blurring the edges of the inpainting mask to create a smoother transition.
|
| 635 |
+
control_image (`PipelineImageInput`, *optional*):
|
| 636 |
+
The conditioning image for control modes (e.g., Canny, depth).
|
| 637 |
+
height (`int`, *optional*, defaults to 1024):
|
| 638 |
+
The height in pixels of the generated image.
|
| 639 |
+
width (`int`, *optional*, defaults to 1024):
|
| 640 |
+
The width in pixels of the generated image.
|
| 641 |
+
num_inference_steps (`int`, *optional*, defaults to 20):
|
| 642 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
| 643 |
+
expense of slower inference.
|
| 644 |
+
sigmas (`List[float]`, *optional*):
|
| 645 |
+
Custom sigmas to use for the denoising process. If not defined, the scheduler's default behavior
|
| 646 |
+
will be used.
|
| 647 |
+
strength (`float`, *optional*, defaults to 1.0):
|
| 648 |
+
Denoising strength for image-to-image. A value of 1.0 means the initial image is fully replaced,
|
| 649 |
+
while a lower value preserves more of the original image structure. Only used in img2img mode.
|
| 650 |
+
guidance_scale (`float`, *optional*, defaults to 4.0):
|
| 651 |
+
The scale for classifier-free guidance. A value > 1 enables it. Higher values encourage images
|
| 652 |
+
closer to the prompt, potentially at the cost of quality.
|
| 653 |
+
cfg_normalization (`bool`, *optional*, defaults to False):
|
| 654 |
+
Whether to apply normalization to the guidance, which can prevent oversaturation.
|
| 655 |
+
cfg_truncation (`float`, *optional*, defaults to 1.0):
|
| 656 |
+
A value between 0.0 and 1.0 that disables CFG for the final portion of the denoising steps,
|
| 657 |
+
specified as a fraction of total steps. For example, 0.8 disables CFG for the last 20% of steps.
|
| 658 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
| 659 |
+
The prompt or prompts not to guide the image generation.
|
| 660 |
+
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
| 661 |
+
The number of images to generate per prompt.
|
| 662 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
| 663 |
+
A torch generator to make generation deterministic.
|
| 664 |
+
latents (`torch.FloatTensor`, *optional*):
|
| 665 |
+
Pre-generated noisy latents to be used as inputs for image generation.
|
| 666 |
+
prompt_embeds (`List[torch.FloatTensor]`, *optional*):
|
| 667 |
+
Pre-generated positive text embeddings.
|
| 668 |
+
negative_prompt_embeds (`List[torch.FloatTensor]`, *optional*):
|
| 669 |
+
Pre-generated negative text embeddings.
|
| 670 |
+
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
|
| 671 |
+
The scale of the control conditioning influence.
|
| 672 |
+
controlnet_refiner_conditioning_scale (`float`, *optional*, defaults to 1.0):
|
| 673 |
+
The scale of the control refiner conditioning influence.
|
| 674 |
+
output_type (`str`, *optional*, defaults to `"pil"`):
|
| 675 |
+
The output format of the generated image. Choose between "pil" (`PIL.Image.Image`), "np" (NumPy array), or "latent".
|
| 676 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
| 677 |
+
Whether to return a `ZImagePipelineOutput` instead of a plain tuple.
|
| 678 |
+
joint_attention_kwargs (`dict`, *optional*):
|
| 679 |
+
A kwargs dictionary for the `AttentionProcessor`.
|
| 680 |
+
callback_on_step_end (`Callable`, *optional*):
|
| 681 |
+
A function that is called at the end of each denoising step.
|
| 682 |
+
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
| 683 |
+
The list of tensor inputs for the `callback_on_step_end` function.
|
| 684 |
+
max_sequence_length (`int`, *optional*, defaults to 512):
|
| 685 |
+
Maximum sequence length to use with the `prompt`.
|
| 686 |
+
|
| 687 |
+
Examples:
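A minimal text-to-image sketch (illustrative; the checkpoint path is a placeholder, and the
pipeline class is assumed to be importable from this repository's `diffusers_local` package):
>>> import torch
>>> from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline
>>> pipe = ZImageControlUnifiedPipeline.from_pretrained("path/to/this-repo", torch_dtype=torch.bfloat16)
>>> pipe.to("cuda")
>>> image = pipe("a photo of a cat on a windowsill", num_inference_steps=20, guidance_scale=4.0).images[0]
>>> image.save("result.png")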
|
| 688 |
+
|
| 689 |
+
Returns:
|
| 690 |
+
[`~pipelines.z_image.ZImagePipelineOutput`] or `tuple`:
|
| 691 |
+
If `return_dict` is True, a `ZImagePipelineOutput` is returned, otherwise a `tuple` with the generated images.
|
| 692 |
+
"""
|
| 693 |
+
self._guidance_scale = guidance_scale
|
| 694 |
+
self._joint_attention_kwargs = joint_attention_kwargs
|
| 695 |
+
self._interrupt = False
|
| 696 |
+
self._cfg_normalization = cfg_normalization
|
| 697 |
+
self._cfg_truncation = cfg_truncation
|
| 698 |
+
is_two_stage_control_model = self.transformer.control_in_dim > self.transformer.in_channels if hasattr(self.transformer, "control_in_dim") else False
|
| 699 |
+
device = self._execution_device
|
| 700 |
+
dtype = self.transformer.dtype
|
| 701 |
+
vae_scale = self.vae_scale_factor * 2
|
| 702 |
+
|
| 703 |
+
ref_image = control_image or image
|
| 704 |
+
image_height = None
|
| 705 |
+
image_width = None
|
| 706 |
+
if ref_image is not None:
|
| 707 |
+
if isinstance(ref_image, Image.Image):
|
| 708 |
+
image_height, image_width = ref_image.height, ref_image.width
|
| 709 |
+
else:
|
| 710 |
+
image_height, image_width = ref_image.shape[-2], ref_image.shape[-1]
|
| 711 |
+
|
| 712 |
+
height = height or image_height or 1024
|
| 713 |
+
width = width or image_width or 1024
|
| 714 |
+
|
| 715 |
+
if height % vae_scale != 0 or width % vae_scale != 0:
|
| 716 |
+
raise ValueError(f"Height/width must be divisible by {vae_scale}.")
|
| 717 |
+
|
| 718 |
+
batch_size = len(prompt) if isinstance(prompt, list) else 1 if prompt else len(prompt_embeds)
|
| 719 |
+
effective_batch_size = batch_size * num_images_per_prompt
|
| 720 |
+
|
| 721 |
+
if prompt_embeds is not None and prompt is None:
|
| 722 |
+
if self.do_classifier_free_guidance and negative_prompt_embeds is None:
|
| 723 |
+
raise ValueError(
|
| 724 |
+
"When `prompt_embeds` is provided without `prompt`, `negative_prompt_embeds` must also be provided for classifier-free guidance."
|
| 725 |
+
)
|
| 726 |
+
else:
|
| 727 |
+
(
|
| 728 |
+
prompt_embeds,
|
| 729 |
+
negative_prompt_embeds,
|
| 730 |
+
) = self.encode_prompt(
|
| 731 |
+
prompt=prompt,
|
| 732 |
+
num_images_per_prompt=num_images_per_prompt,
|
| 733 |
+
negative_prompt=negative_prompt,
|
| 734 |
+
do_classifier_free_guidance=self.do_classifier_free_guidance,
|
| 735 |
+
prompt_embeds=prompt_embeds,
|
| 736 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
| 737 |
+
device=device,
|
| 738 |
+
max_sequence_length=max_sequence_length,
|
| 739 |
+
)
|
| 740 |
+
|
| 741 |
+
if self.do_classifier_free_guidance:
|
| 742 |
+
prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds
|
| 743 |
+
else:
|
| 744 |
+
prompt_embeds_model_input = prompt_embeds
|
| 745 |
+
|
| 746 |
+
is_inpaint_mode = image is not None and mask_image is not None
|
| 747 |
+
is_img2img_mode = image is not None and not is_inpaint_mode
|
| 748 |
+
|
| 749 |
+
if control_image is not None or is_inpaint_mode:
|
| 750 |
+
control_latents = self.prepare_control_latents(control_image, width, height, batch_size, num_images_per_prompt, device, dtype)
|
| 751 |
+
|
| 752 |
+
if is_two_stage_control_model:
|
| 753 |
+
mask_to_use = self._apply_mask_blur(mask_image, mask_blur_radius, is_inpaint_mode)
|
| 754 |
+
|
| 755 |
+
inpaint_latents = self._prepare_image_latents(
|
| 756 |
+
image, mask_to_use, width, height, batch_size, num_images_per_prompt, device, dtype, do_preprocess=True
|
| 757 |
+
)
|
| 758 |
+
|
| 759 |
+
mask_latents = self._prepare_mask_latents(
|
| 760 |
+
mask_to_use,
|
| 761 |
+
width,
|
| 762 |
+
height,
|
| 763 |
+
batch_size,
|
| 764 |
+
num_images_per_prompt,
|
| 765 |
+
inpaint_latents.shape,
|
| 766 |
+
device,
|
| 767 |
+
dtype,
|
| 768 |
+
)
|
| 769 |
+
control_context = torch.cat([control_latents, mask_latents, inpaint_latents], dim=1)
|
| 770 |
+
else:
|
| 771 |
+
control_context = control_latents
|
| 772 |
+
else:
|
| 773 |
+
control_context = None
|
| 774 |
+
|
| 775 |
+
if self.do_classifier_free_guidance and control_context is not None:  # guard against None in pure text-to-image mode
|
| 776 |
+
control_context_model_input = control_context.repeat(2, 1, 1, 1, 1)
|
| 777 |
+
else:
|
| 778 |
+
control_context_model_input = control_context
|
| 779 |
+
|
| 780 |
+
image_seq_len = (height // (self.vae_scale_factor * 2)) * (width // (self.vae_scale_factor * 2))
|
| 781 |
+
mu = calculate_shift(image_seq_len)
|
| 782 |
+
self.scheduler.sigma_min = 0.0
|
| 783 |
+
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas, mu=mu)
|
| 784 |
+
self._num_timesteps = len(timesteps)
|
| 785 |
+
|
| 786 |
+
if is_img2img_mode and not is_inpaint_mode:
|
| 787 |
+
strength = min(strength, 1.0)
|
| 788 |
+
else:
|
| 789 |
+
strength = 1.0
|
| 790 |
+
|
| 791 |
+
if strength < 1.0:
|
| 792 |
+
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
| 793 |
+
t_start = max(num_inference_steps - init_timestep, 0)
|
| 794 |
+
timesteps = timesteps[t_start * self.scheduler.order :]
|
| 795 |
+
num_steps_to_run = len(timesteps) // self.scheduler.order
|
| 796 |
+
else:
|
| 797 |
+
num_steps_to_run = num_inference_steps
|
| 798 |
+
|
| 799 |
+
latent_timestep = timesteps[:1].repeat(effective_batch_size) if strength < 1.0 else None
|
| 800 |
+
|
| 801 |
+
use_image_for_latents = is_img2img_mode and not is_inpaint_mode
|
| 802 |
+
latents = self.prepare_latents(
|
| 803 |
+
effective_batch_size,
|
| 804 |
+
self.transformer.in_channels,
|
| 805 |
+
height,
|
| 806 |
+
width,
|
| 807 |
+
torch.float32,
|
| 808 |
+
device,
|
| 809 |
+
generator,
|
| 810 |
+
image=image if use_image_for_latents else None,
|
| 811 |
+
timestep=latent_timestep if use_image_for_latents else None,
|
| 812 |
+
latents=latents,
|
| 813 |
+
)
|
| 814 |
+
|
| 815 |
+
num_warmup_steps = len(timesteps) - num_steps_to_run * self.scheduler.order
|
| 816 |
+
with torch.inference_mode():
|
| 817 |
+
with self.progress_bar(total=num_steps_to_run) as progress_bar:
|
| 818 |
+
for i, t in enumerate(timesteps):
|
| 819 |
+
if self.interrupt:
|
| 820 |
+
continue
|
| 821 |
+
|
| 822 |
+
timestep = t.expand(latents.shape[0])
|
| 823 |
+
timestep = (1000 - timestep) / 1000
|
| 824 |
+
t_norm = timestep[0].item()
|
| 825 |
+
|
| 826 |
+
current_guidance_scale = self.guidance_scale
|
| 827 |
+
if self.do_classifier_free_guidance and self._cfg_truncation is not None and float(self._cfg_truncation) <= 1:
|
| 828 |
+
if t_norm > self._cfg_truncation:
|
| 829 |
+
current_guidance_scale = 0.0
|
| 830 |
+
|
| 831 |
+
apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0
|
| 832 |
+
|
| 833 |
+
if apply_cfg:
|
| 834 |
+
latents_typed = latents.to(self.transformer.dtype)
|
| 835 |
+
latent_model_input = latents_typed.repeat(2, 1, 1, 1)
|
| 836 |
+
timestep_model_input = timestep.repeat(2)
|
| 837 |
+
else:
|
| 838 |
+
latent_model_input = latents.to(self.transformer.dtype)
|
| 839 |
+
timestep_model_input = timestep
|
| 840 |
+
|
| 841 |
+
latent_model_input = latent_model_input.unsqueeze(2)
|
| 842 |
+
latent_model_input_list = list(latent_model_input.unbind(dim=0))
|
| 843 |
+
|
| 844 |
+
model_out_list = self.transformer(
|
| 845 |
+
x=latent_model_input_list,
|
| 846 |
+
t=timestep_model_input,
|
| 847 |
+
cap_feats=prompt_embeds_model_input,
|
| 848 |
+
control_context=control_context_model_input,
|
| 849 |
+
conditioning_scale=controlnet_conditioning_scale,
|
| 850 |
+
refiner_conditioning_scale=controlnet_refiner_conditioning_scale,
|
| 851 |
+
)[0]
|
| 852 |
+
|
| 853 |
+
if apply_cfg:
|
| 854 |
+
pos_out = model_out_list[:effective_batch_size]
|
| 855 |
+
neg_out = model_out_list[effective_batch_size:]
|
| 856 |
+
|
| 857 |
+
noise_pred = []
|
| 858 |
+
for j in range(effective_batch_size):
|
| 859 |
+
pos = pos_out[j].float()
|
| 860 |
+
neg = neg_out[j].float()
|
| 861 |
+
|
| 862 |
+
pred = pos + current_guidance_scale * (pos - neg)
|
| 863 |
+
|
| 864 |
+
if self._cfg_normalization and float(self._cfg_normalization) > 0.0:
|
| 865 |
+
ori_pos_norm = torch.linalg.vector_norm(pos)
|
| 866 |
+
new_pos_norm = torch.linalg.vector_norm(pred)
|
| 867 |
+
max_new_norm = ori_pos_norm * float(self._cfg_normalization)
|
| 868 |
+
if new_pos_norm > max_new_norm:
|
| 869 |
+
pred = pred * (max_new_norm / new_pos_norm)
|
| 870 |
+
|
| 871 |
+
noise_pred.append(pred)
|
| 872 |
+
|
| 873 |
+
noise_pred = torch.stack(noise_pred, dim=0)
|
| 874 |
+
else:
|
| 875 |
+
noise_pred = torch.stack([t.float() for t in model_out_list], dim=0)
|
| 876 |
+
|
| 877 |
+
noise_pred = noise_pred.squeeze(2)
|
| 878 |
+
noise_pred = -noise_pred
|
| 879 |
+
|
| 880 |
+
latents = self.scheduler.step(noise_pred.to(torch.float32), t, latents).prev_sample
|
| 881 |
+
|
| 882 |
+
if callback_on_step_end is not None:
|
| 883 |
+
callback_kwargs = {}
|
| 884 |
+
for k in callback_on_step_end_tensor_inputs:
|
| 885 |
+
callback_kwargs[k] = locals()[k]
|
| 886 |
+
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
| 887 |
+
|
| 888 |
+
if isinstance(callback_outputs, dict):
|
| 889 |
+
latents = callback_outputs.pop("latents", latents)
|
| 890 |
+
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
| 891 |
+
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
| 892 |
+
|
| 893 |
+
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
| 894 |
+
progress_bar.update()
|
| 895 |
+
|
| 896 |
+
if output_type != "latent":
|
| 897 |
+
latents = latents.to(self.vae.dtype)
|
| 898 |
+
latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
|
| 899 |
+
with torch.no_grad():
|
| 900 |
+
image = self.vae.decode(latents, return_dict=False)[0]
|
| 901 |
+
image = self.image_processor.postprocess(image, output_type=output_type)
|
| 902 |
+
else:
|
| 903 |
+
image = latents
|
| 904 |
+
|
| 905 |
+
self.maybe_free_model_hooks()
|
| 906 |
+
|
| 907 |
+
if not return_dict:
|
| 908 |
+
return (image,)
|
| 909 |
+
|
| 910 |
+
return ZImagePipelineOutput(images=image)
|
diffusers_local/z_image_control_transformer_2d.py
ADDED
|
@@ -0,0 +1,1443 @@
|
| 1 |
+
# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
|
| 2 |
+
# Refactored and optimized by DEVAIEXP Team
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
import math
|
| 18 |
+
from typing import Dict, List, Optional, Tuple, Union
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
import torch.nn as nn
|
| 22 |
+
import torch.nn.functional as F
|
| 23 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
| 24 |
+
from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
|
| 25 |
+
from diffusers.models.attention_dispatch import dispatch_attention_fn
|
| 26 |
+
from diffusers.models.attention_processor import Attention, AttentionProcessor
|
| 27 |
+
from diffusers.models.modeling_outputs import Transformer2DModelOutput
|
| 28 |
+
from diffusers.models.modeling_utils import ModelMixin
|
| 29 |
+
from diffusers.models.normalization import RMSNorm
|
| 30 |
+
from diffusers.utils import (
|
| 31 |
+
is_torch_version,
|
| 32 |
+
)
|
| 33 |
+
from diffusers.utils.torch_utils import maybe_allow_in_graph
|
| 34 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
ADALN_EMBED_DIM = 256
|
| 38 |
+
SEQ_MULTI_OF = 32
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def zero_module(module):
|
| 42 |
+
"""
|
| 43 |
+
Initializes the parameters of a given module with zeros.
|
| 44 |
+
|
| 45 |
+
Args:
|
| 46 |
+
module (nn.Module): The module to be zero-initialized.
|
| 47 |
+
|
| 48 |
+
Returns:
|
| 49 |
+
nn.Module: The same module with its parameters initialized to zero.
|
| 50 |
+
"""
|
| 51 |
+
for p in module.parameters():
|
| 52 |
+
nn.init.zeros_(p)
|
| 53 |
+
return module
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class TimestepEmbedder(nn.Module):
|
| 57 |
+
"""
|
| 58 |
+
A module to embed timesteps into a higher-dimensional space using sinusoidal embeddings
|
| 59 |
+
followed by a multilayer perceptron (MLP).
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
def __init__(self, out_size, mid_size=None, frequency_embedding_size=256):
|
| 63 |
+
"""
|
| 64 |
+
Initializes the TimestepEmbedder module.
|
| 65 |
+
|
| 66 |
+
Args:
|
| 67 |
+
out_size (int): The output dimension of the embedding.
|
| 68 |
+
mid_size (int, optional): The intermediate dimension of the MLP. Defaults to `out_size`.
|
| 69 |
+
frequency_embedding_size (int, optional): The dimension of the sinusoidal frequency embedding. Defaults to 256.
|
| 70 |
+
"""
|
| 71 |
+
super().__init__()
|
| 72 |
+
if mid_size is None:
|
| 73 |
+
mid_size = out_size
|
| 74 |
+
self.mlp = nn.Sequential(
|
| 75 |
+
nn.Linear(
|
| 76 |
+
frequency_embedding_size,
|
| 77 |
+
mid_size,
|
| 78 |
+
bias=True,
|
| 79 |
+
),
|
| 80 |
+
nn.SiLU(),
|
| 81 |
+
nn.Linear(
|
| 82 |
+
mid_size,
|
| 83 |
+
out_size,
|
| 84 |
+
bias=True,
|
| 85 |
+
),
|
| 86 |
+
)
|
| 87 |
+
self.frequency_embedding_size = frequency_embedding_size
|
| 88 |
+
|
| 89 |
+
@staticmethod
|
| 90 |
+
def timestep_embedding(t, dim, max_period=10000):
|
| 91 |
+
"""
|
| 92 |
+
Creates sinusoidal timestep embeddings.
|
| 93 |
+
|
| 94 |
+
Args:
|
| 95 |
+
t (torch.Tensor): A 1-D Tensor of N timesteps.
|
| 96 |
+
dim (int): The dimension of the embedding.
|
| 97 |
+
max_period (int, optional): The maximum period for the sinusoidal frequencies. Defaults to 10000.
|
| 98 |
+
|
| 99 |
+
Returns:
|
| 100 |
+
torch.Tensor: The timestep embeddings with shape (N, dim).
|
| 101 |
+
"""
|
| 102 |
+
with torch.amp.autocast("cuda", enabled=False):
|
| 103 |
+
half = dim // 2
|
| 104 |
+
freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half)
|
| 105 |
+
args = t[:, None] * freqs[None]
|
| 106 |
+
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
| 107 |
+
if dim % 2:
|
| 108 |
+
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
| 109 |
+
return embedding
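# For reference: this produces the standard sinusoidal embedding
# emb(t) = [cos(t * f_0), ..., cos(t * f_{half-1}), sin(t * f_0), ..., sin(t * f_{half-1})]
# with f_i = exp(-ln(max_period) * i / half), computed in float32 with autocast disabled.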
|
| 110 |
+
|
| 111 |
+
def forward(self, t):
|
| 112 |
+
"""
|
| 113 |
+
Processes the input timesteps to generate embeddings.
|
| 114 |
+
|
| 115 |
+
Args:
|
| 116 |
+
t (torch.Tensor): The input timesteps.
|
| 117 |
+
|
| 118 |
+
Returns:
|
| 119 |
+
torch.Tensor: The final timestep embeddings after passing through the MLP.
|
| 120 |
+
"""
|
| 121 |
+
t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
|
| 122 |
+
weight_dtype = self.mlp[0].weight.dtype
|
| 123 |
+
if weight_dtype.is_floating_point:
|
| 124 |
+
t_freq = t_freq.to(weight_dtype)
|
| 125 |
+
t_emb = self.mlp(t_freq)
|
| 126 |
+
return t_emb
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class FeedForward(nn.Module):
|
| 130 |
+
"""
|
| 131 |
+
A Feed-Forward Network module using SwiGLU activation.
|
| 132 |
+
"""
|
| 133 |
+
|
| 134 |
+
def __init__(self, dim: int, hidden_dim: int):
|
| 135 |
+
"""
|
| 136 |
+
Initializes the FeedForward module.
|
| 137 |
+
|
| 138 |
+
Args:
|
| 139 |
+
dim (int): Input and output dimension.
|
| 140 |
+
hidden_dim (int): The hidden dimension of the network.
|
| 141 |
+
"""
|
| 142 |
+
super().__init__()
|
| 143 |
+
self.w1 = nn.Linear(dim, hidden_dim, bias=False)
|
| 144 |
+
self.w2 = nn.Linear(hidden_dim, dim, bias=False)
|
| 145 |
+
self.w3 = nn.Linear(dim, hidden_dim, bias=False)
|
| 146 |
+
|
| 147 |
+
def _forward_silu_gating(self, x1, x3):
|
| 148 |
+
"""
|
| 149 |
+
Applies the SiLU gating mechanism.
|
| 150 |
+
|
| 151 |
+
Args:
|
| 152 |
+
x1 (torch.Tensor): The first intermediate tensor.
|
| 153 |
+
x3 (torch.Tensor): The second intermediate tensor (gate).
|
| 154 |
+
|
| 155 |
+
Returns:
|
| 156 |
+
torch.Tensor: The result of the gating operation.
|
| 157 |
+
"""
|
| 158 |
+
return F.silu(x1) * x3
|
| 159 |
+
|
| 160 |
+
def forward(self, x):
|
| 161 |
+
"""
|
| 162 |
+
Defines the forward pass of the FeedForward network.
|
| 163 |
+
|
| 164 |
+
Args:
|
| 165 |
+
x (torch.Tensor): The input tensor.
|
| 166 |
+
|
| 167 |
+
Returns:
|
| 168 |
+
torch.Tensor: The output tensor.
|
| 169 |
+
"""
|
| 170 |
+
return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
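# In equation form (for reference): FeedForward(x) = W2(SiLU(W1 x) * W3 x), i.e. the
# SwiGLU variant of the transformer MLP, matching the three bias-free projections above.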
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
class FinalLayer(nn.Module):
|
| 174 |
+
"""
|
| 175 |
+
The final layer of the transformer, which applies AdaLN modulation and a linear projection.
|
| 176 |
+
"""
|
| 177 |
+
|
| 178 |
+
def __init__(self, hidden_size, out_channels):
|
| 179 |
+
"""
|
| 180 |
+
Initializes the FinalLayer module.
|
| 181 |
+
|
| 182 |
+
Args:
|
| 183 |
+
hidden_size (int): The input hidden size.
|
| 184 |
+
out_channels (int): The output dimension (number of channels).
|
| 185 |
+
"""
|
| 186 |
+
super().__init__()
|
| 187 |
+
self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 188 |
+
self.linear = nn.Linear(hidden_size, out_channels, bias=True)
|
| 189 |
+
self.adaLN_modulation = nn.Sequential(
|
| 190 |
+
nn.SiLU(),
|
| 191 |
+
nn.Linear(min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True),
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
def forward(self, x, c):
|
| 195 |
+
"""
|
| 196 |
+
Defines the forward pass for the final layer.
|
| 197 |
+
|
| 198 |
+
Args:
|
| 199 |
+
x (torch.Tensor): The main input tensor from the transformer blocks.
|
| 200 |
+
c (torch.Tensor): The conditioning tensor (usually from timestep embedding) for AdaLN modulation.
|
| 201 |
+
|
| 202 |
+
Returns:
|
| 203 |
+
torch.Tensor: The final output tensor projected to the patch dimension.
|
| 204 |
+
"""
|
| 205 |
+
scale = 1.0 + self.adaLN_modulation(c)
|
| 206 |
+
x = self.norm_final(x) * scale.unsqueeze(1)
|
| 207 |
+
x = self.linear(x)
|
| 208 |
+
return x
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
class RopeEmbedder:
    """
    Computes Rotary Positional Embeddings (RoPE) for 3D coordinates.
    """

    def __init__(self, theta: float = 256.0, axes_dims: List[int] = (32, 48, 48), axes_lens: List[int] = (1024, 512, 512)):
        """
        Initializes the RopeEmbedder.

        Args:
            theta (float, optional): The base for the rotary frequencies. Defaults to 256.0.
            axes_dims (List[int], optional): The dimensions for each axis (F, H, W). Defaults to (32, 48, 48).
            axes_lens (List[int], optional): The maximum length for each axis. Defaults to (1024, 512, 512).
        """
        self.theta = theta
        self.axes_dims = axes_dims
        self.axes_lens = axes_lens
        self.freqs_cis_cache = {}

    def _precompute_freqs_cis(self, device):
        """
        Precomputes and caches the rotary frequency tensors (cos and sin values).

        Args:
            device (torch.device): The device to store the cached tensors on.

        Returns:
            List[torch.Tensor]: A list of precomputed frequency tensors for each axis.
        """
        if device in self.freqs_cis_cache:
            return self.freqs_cis_cache[device]
        freqs_cis_list = []
        for dim, max_len in zip(self.axes_dims, self.axes_lens):
            half = dim // 2
            freqs = 1.0 / (self.theta ** (torch.arange(0, half, device=device, dtype=torch.float32) / half))
            t = torch.arange(max_len, device=device, dtype=torch.float32)
            freqs = torch.outer(t, freqs)
            emb = torch.stack([freqs.cos(), freqs.sin()], dim=-1)
            freqs_cis_list.append(emb)
        self.freqs_cis_cache[device] = freqs_cis_list
        return freqs_cis_list

    def __call__(self, ids: torch.Tensor):
        """
        Generates RoPE embeddings for a batch of 3D coordinates.

        Args:
            ids (torch.Tensor): A tensor of coordinates with shape (N, 3).

        Returns:
            torch.Tensor: The concatenated RoPE embeddings for the input coordinates.
        """
        assert ids.ndim == 2 and ids.shape[1] == len(self.axes_dims)
        device = ids.device
        freqs_cis_list = self._precompute_freqs_cis(device)
        result = []
        for i in range(len(self.axes_dims)):
            result.append(freqs_cis_list[i][ids[:, i]])
        return torch.cat(result, dim=-2)

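# --- Illustrative sketch (editor's example, not from the original file) ---
# Each (frame, height, width) coordinate indexes one precomputed (cos, sin) table per axis,
# and the per-axis tables are concatenated along the pair dimension. With the default
# axes_dims=(32, 48, 48) that dimension is 16 + 24 + 24 = 64:
#   >>> rope = RopeEmbedder()
#   >>> ids = torch.zeros(10, 3, dtype=torch.long)  # 10 tokens at coordinate (0, 0, 0)
#   >>> rope(ids).shape
#   torch.Size([10, 64, 2])
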
class ZSingleStreamAttnProcessor:
    """
    An attention processor that applies Rotary Positional Embeddings (RoPE) to query and key tensors
    before computing scaled dot-product attention.
    """

    _attention_backend = None
    _parallel_config = None

    def __init__(self):
        """
        Initializes the ZSingleStreamAttnProcessor.
        """
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher.")

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        freqs_cis: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        The forward call for the attention processor.

        Args:
            attn (Attention): The attention layer that this processor is attached to.
            hidden_states (torch.Tensor): The input hidden states.
            encoder_hidden_states (Optional[torch.Tensor], optional): Not used in self-attention. Defaults to None.
            attention_mask (Optional[torch.Tensor], optional): The attention mask. Defaults to None.
            freqs_cis (Optional[torch.Tensor], optional): The precomputed RoPE frequencies. Defaults to None.

        Returns:
            torch.Tensor: The output of the attention mechanism.
        """

        def apply_rotary_emb(q_or_k: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
            """
            Applies RoPE to a query or key tensor.
            """
            x = q_or_k.transpose(1, 2)
            x_reshaped = x.float().reshape(*x.shape[:-1], -1, 2)
            x0 = x_reshaped[..., 0]
            x1 = x_reshaped[..., 1]
            freqs_cos = freqs_cis[..., 0].unsqueeze(1)
            freqs_sin = freqs_cis[..., 1].unsqueeze(1)
            x_rotated_0 = x0 * freqs_cos - x1 * freqs_sin
            x_rotated_1 = x0 * freqs_sin + x1 * freqs_cos
            x_rotated = torch.stack((x_rotated_0, x_rotated_1), dim=-1)
            x_out = x_rotated.flatten(-2).transpose(1, 2)
            return x_out.to(q_or_k.dtype)

        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        query = query.unflatten(-1, (attn.heads, -1))
        key = key.unflatten(-1, (attn.heads, -1))
        value = value.unflatten(-1, (attn.heads, -1))

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        if freqs_cis is not None:
            query = apply_rotary_emb(query, freqs_cis)
            key = apply_rotary_emb(key, freqs_cis)

        if attention_mask is not None and attention_mask.ndim == 2:
            attention_mask = attention_mask[:, None, None, :]

        hidden_states = dispatch_attention_fn(
            query,
            key,
            value,
            attn_mask=attention_mask,
            dropout_p=0.0,
            is_causal=False,
            backend=self._attention_backend,
            parallel_config=self._parallel_config,
        )

        hidden_states = hidden_states.flatten(2, 3)

        output = attn.to_out[0](hidden_states.to(hidden_states.dtype))
        if len(attn.to_out) > 1:
            output = attn.to_out[1](output)

        return output

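# --- Editor's note (explanatory comment, not from the original file) ---
# apply_rotary_emb above treats each head dimension as interleaved (x0, x1) pairs and rotates
# every pair by the per-position angle stored in freqs_cis:
#     x0' = x0 * cos(theta) - x1 * sin(theta)
#     x1' = x0 * sin(theta) + x1 * cos(theta)
# which is the real-valued form of multiplying the complex number (x0 + i * x1) by e^{i * theta}.
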
@maybe_allow_in_graph
class ZImageTransformerBlock(nn.Module):
    """
    A standard transformer block consisting of a self-attention layer and a feed-forward network.
    Includes support for AdaLN modulation.
    """

    def __init__(
        self,
        layer_id: int,
        dim: int,
        n_heads: int,
        n_kv_heads: int,
        norm_eps: float,
        qk_norm: bool,
        modulation=True,
    ):
        """
        Initializes the ZImageTransformerBlock.

        Args:
            layer_id (int): The index of the layer.
            dim (int): The dimension of the input and output features.
            n_heads (int): The number of attention heads.
            n_kv_heads (int): The number of key/value heads (not directly used in this simplified attention).
            norm_eps (float): Epsilon for RMSNorm.
            qk_norm (bool): Whether to apply normalization to query and key tensors.
            modulation (bool, optional): Whether to enable AdaLN modulation. Defaults to True.
        """
        super().__init__()
        self.dim = dim
        self.head_dim = dim // n_heads
        self.attention = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=dim // n_heads,
            heads=n_heads,
            qk_norm="rms_norm" if qk_norm else None,
            eps=1e-5,
            bias=False,
            out_bias=False,
            processor=ZSingleStreamAttnProcessor(),
        )

        self.feed_forward = FeedForward(dim=dim, hidden_dim=int(dim / 3 * 8))
        self.layer_id = layer_id

        self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)

        self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)

        self.modulation = modulation
        if modulation:
            self.adaLN_modulation = nn.Sequential(
                nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True),
            )

    @property
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        """
        Returns a dictionary of all attention processors used in the module.
        """
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()
            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        """
        Sets the attention processor for the attention layer in this block.
        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))
            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    def forward(self, x, attn_mask, freqs_cis, adaln_input=None):
        """
        Defines the forward pass for the transformer block.

        Args:
            x (torch.Tensor): The input tensor.
            attn_mask (torch.Tensor): The attention mask.
            freqs_cis (torch.Tensor): The RoPE frequencies.
            adaln_input (torch.Tensor, optional): The conditioning tensor for AdaLN. Defaults to None.

        Returns:
            torch.Tensor: The output tensor of the block.
        """
        if self.modulation:
            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2)
            scale_msa = scale_msa + 1.0
            gate_msa = gate_msa.tanh()
            scale_mlp = scale_mlp + 1.0
            gate_mlp = gate_mlp.tanh()

            normed = self.attention_norm1(x)
            normed = normed * scale_msa
            attn_out = self.attention(normed, attention_mask=attn_mask, freqs_cis=freqs_cis)
            attn_out = self.attention_norm2(attn_out) * gate_msa
            x = x + attn_out

            normed = self.ffn_norm1(x)
            normed = normed * scale_mlp
            ffn_out = self.feed_forward(normed)
            ffn_out = self.ffn_norm2(ffn_out) * gate_mlp
            x = x + ffn_out
        else:
            normed = self.attention_norm1(x)
            attn_out = self.attention(normed, attention_mask=attn_mask, freqs_cis=freqs_cis)
            x = x + self.attention_norm2(attn_out)
            normed = self.ffn_norm1(x)
            ffn_out = self.feed_forward(normed)
            x = x + self.ffn_norm2(ffn_out)
        return x

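# --- Editor's note (explanatory comment, not from the original file) ---
# With modulation enabled, adaLN_modulation maps the timestep embedding of shape
# (B, min(dim, ADALN_EMBED_DIM)) to (B, 4 * dim); unsqueeze(1) plus chunk(4, dim=2) yields
# four (B, 1, dim) tensors: scale_msa, gate_msa, scale_mlp, gate_mlp. Each sub-layer is a
# "sandwich": pre-norm * (1 + scale) -> attention/FFN -> post-norm * tanh(gate) -> residual
# add, so a zero gate leaves the residual stream unchanged.
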
class ZImageControlTransformerBlock(ZImageTransformerBlock):
    """
    A specialized transformer block for the control pathway. It inherits from ZImageTransformerBlock
    and adds projection layers to generate and combine control signals.
    """

    def __init__(self, layer_id: int, dim: int, n_heads: int, n_kv_heads: int, norm_eps: float, qk_norm: bool, modulation=True, block_id=0):
        """
        Initializes the ZImageControlTransformerBlock.

        Args:
            layer_id (int): The index of the layer.
            dim (int): The dimension of the features.
            n_heads (int): The number of attention heads.
            n_kv_heads (int): The number of key/value heads.
            norm_eps (float): Epsilon for RMSNorm.
            qk_norm (bool): Whether to apply normalization to query and key.
            modulation (bool, optional): Whether to enable AdaLN modulation. Defaults to True.
            block_id (int, optional): The index of this control block. Defaults to 0.
        """
        super().__init__(layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation)
        self.block_id = block_id
        if block_id == 0:
            self.before_proj = zero_module(nn.Linear(self.dim, self.dim))
        self.after_proj = zero_module(nn.Linear(self.dim, self.dim))

    def forward(self, c, x, **kwargs):
        """
        Defines the forward pass for the control block.

        Args:
            c (torch.Tensor): The control signal tensor.
            x (torch.Tensor): The reference tensor from the main pathway.
            **kwargs: Additional arguments for the parent's forward method.

        Returns:
            torch.Tensor: A stacked tensor containing the skip connection and the final output.
        """
        if self.block_id == 0:
            c = self.before_proj(c) + x
            all_c = []
        else:
            all_c = list(torch.unbind(c))
            c = all_c.pop(-1)

        c = super().forward(c, **kwargs)
        c_skip = self.after_proj(c)
        all_c += [c_skip, c]
        c = torch.stack(all_c)
        return c

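# --- Editor's note (explanatory comment, not from the original file) ---
# The control blocks are chained by stacking tensors: the block with block_id == 0 projects
# the control signal, adds the main-path reference x, and returns stack([skip_0, c]); every
# later block pops the running control stream (the last entry), processes it, and appends its
# own zero-initialized after_proj skip plus the updated stream. After the last control block,
# torch.unbind(...)[:-1] are the per-layer hints and [...][-1] is the running control stream.
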
class BaseZImageTransformerBlock(ZImageTransformerBlock):
    """
    The main transformer block used in the primary pathway. It inherits from ZImageTransformerBlock
    and adds the logic to inject control "hints" from the control pathway.
    """

    def __init__(self, layer_id: int, dim: int, n_heads: int, n_kv_heads: int, norm_eps: float, qk_norm: bool, modulation=True, block_id=0):
        """
        Initializes the BaseZImageTransformerBlock.

        Args:
            layer_id (int): The index of the layer.
            dim (int): The dimension of the features.
            n_heads (int): The number of attention heads.
            n_kv_heads (int): The number of key/value heads.
            norm_eps (float): Epsilon for RMSNorm.
            qk_norm (bool): Whether to apply normalization to query and key.
            modulation (bool, optional): Whether to enable AdaLN modulation. Defaults to True.
            block_id (int, optional): The index used to retrieve the corresponding control hint. Defaults to 0.
        """
        super().__init__(layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation)
        self.block_id = block_id

    def forward(self, hidden_states, hints=None, context_scale=1.0, **kwargs):
        """
        Defines the forward pass, including the injection of control hints.

        Args:
            hidden_states (torch.Tensor): The input tensor.
            hints (List[torch.Tensor], optional): A list of control hints from the control pathway. Defaults to None.
            context_scale (float, optional): A scale factor for the control hints. Defaults to 1.0.
            **kwargs: Additional arguments for the parent's forward method.

        Returns:
            torch.Tensor: The output tensor of the block.
        """
        hidden_states = super().forward(hidden_states, **kwargs)
        if self.block_id is not None and hints is not None:
            hidden_states = hidden_states + hints[self.block_id] * context_scale
        return hidden_states

class ZImageControlTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
    _supports_gradient_checkpointing = True
    _keys_to_ignore_on_load_unexpected = [
        r"control_layers\..*",
        r"control_noise_refiner\..*",
        r"control_all_x_embedder\..*",
    ]
    _no_split_modules = ["ZImageTransformerBlock", "BaseZImageTransformerBlock", "ZImageControlTransformerBlock"]
    _skip_layerwise_casting_patterns = ["t_embedder", "cap_embedder"]
    _group_offload_block_modules = ["t_embedder", "cap_embedder"]

    @register_to_config
    def __init__(
        self,
        control_layers_places=None,
        control_refiner_layers_places=None,
        control_in_dim=None,
        add_control_noise_refiner=False,
        all_patch_size=(2,),
        all_f_patch_size=(1,),
        in_channels=16,
        dim=3840,
        n_layers=30,
        n_refiner_layers=2,
        n_heads=30,
        n_kv_heads=30,
        norm_eps=1e-5,
        qk_norm=True,
        cap_feat_dim=2560,
        rope_theta=256.0,
        t_scale=1000.0,
        axes_dims=[32, 48, 48],
        axes_lens=[1024, 512, 512],
        use_controlnet=True,
        checkpoint_ratio=0.5,
    ):
        """
        Initializes the ZImageControlTransformer2DModel.

        Args:
            control_layers_places (List[int], optional): Indices of main layers where control hints are injected.
            control_refiner_layers_places (List[int], optional): Indices of noise refiner layers for two-stage control.
            control_in_dim (int, optional): Input channel dimension for the control context.
            add_control_noise_refiner (bool, optional): Whether to add a dedicated refiner for the control signal.
            all_patch_size (Tuple[int], optional): Tuple of patch sizes for spatial dimensions.
            all_f_patch_size (Tuple[int], optional): Tuple of patch sizes for the frame dimension.
            in_channels (int, optional): Number of input channels for the latent image.
            dim (int, optional): The main dimension of the transformer model.
            n_layers (int, optional): The number of main transformer layers.
            n_refiner_layers (int, optional): The number of layers in the refiner blocks.
            n_heads (int, optional): The number of attention heads.
            n_kv_heads (int, optional): The number of key/value heads.
            norm_eps (float, optional): Epsilon for RMSNorm.
            qk_norm (bool, optional): Whether to apply normalization to query and key.
            cap_feat_dim (int, optional): The dimension of the input caption features.
            rope_theta (float, optional): The base for RoPE.
            t_scale (float, optional): A scaling factor for the timestep.
            axes_dims (List[int], optional): Dimensions for each axis in RoPE.
            axes_lens (List[int], optional): Maximum lengths for each axis in RoPE.
            use_controlnet (bool, optional): If False, control-related layers will not be created to save memory.
            checkpoint_ratio (float, optional): The ratio of layers to apply gradient checkpointing to.
        """
        super().__init__()
        self.use_controlnet = use_controlnet
        self.in_channels = in_channels
        self.out_channels = in_channels
        self.all_patch_size = all_patch_size
        self.all_f_patch_size = all_f_patch_size
        self.dim = dim
        self.control_in_dim = self.dim if control_in_dim is None else control_in_dim
        self.is_two_stage_control = self.control_in_dim > 16
        self.n_heads = n_heads
        self.rope_theta = rope_theta
        self.t_scale = t_scale
        self.gradient_checkpointing = False
        self.checkpoint_ratio = checkpoint_ratio
        assert len(all_patch_size) == len(all_f_patch_size)

        self.control_layers_places = list(range(0, n_layers, 2)) if control_layers_places is None else control_layers_places
        self.control_refiner_layers_places = list(range(0, n_refiner_layers)) if control_refiner_layers_places is None else control_refiner_layers_places
        self.add_control_noise_refiner = add_control_noise_refiner
        assert 0 in self.control_layers_places
        self.control_layers_mapping = {i: n for n, i in enumerate(self.control_layers_places)}
        self.control_refiner_layers_mapping = {i: n for n, i in enumerate(self.control_refiner_layers_places)}

        self.all_x_embedder = nn.ModuleDict(
            {
                f"{patch_size}-{f_patch_size}": nn.Linear(f_patch_size * patch_size * patch_size * in_channels, dim, bias=True)
                for patch_size, f_patch_size in zip(all_patch_size, all_f_patch_size)
            }
        )

        self.all_final_layer = nn.ModuleDict(
            {
                f"{patch_size}-{f_patch_size}": FinalLayer(dim, patch_size * patch_size * f_patch_size * self.out_channels)
                for patch_size, f_patch_size in zip(all_patch_size, all_f_patch_size)
            }
        )

        self.context_refiner = nn.ModuleList(
            [ZImageTransformerBlock(i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation=False) for i in range(n_refiner_layers)]
        )
        self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024)
        self.cap_embedder = nn.Sequential(RMSNorm(cap_feat_dim, eps=norm_eps), nn.Linear(cap_feat_dim, dim, bias=True))
        self.x_pad_token = nn.Parameter(torch.empty((1, dim)))
        self.cap_pad_token = nn.Parameter(torch.empty((1, dim)))

        head_dim = dim // n_heads
        assert head_dim == sum(axes_dims)
        self.axes_dims = axes_dims
        self.axes_lens = axes_lens
        self.rope_embedder = RopeEmbedder(theta=rope_theta, axes_dims=axes_dims, axes_lens=axes_lens)

        self.layers = nn.ModuleList(
            [BaseZImageTransformerBlock(i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, block_id=self.control_layers_mapping.get(i)) for i in range(n_layers)]
        )

        self.noise_refiner = nn.ModuleList(
            [
                BaseZImageTransformerBlock(
                    1000 + i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation=True, block_id=self.control_refiner_layers_mapping.get(i)
                )
                for i in range(n_refiner_layers)
            ]
        )

        if self.use_controlnet:
            self.control_layers = nn.ModuleList(
                [ZImageControlTransformerBlock(i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, block_id=i) for i in self.control_layers_places]
            )
            self.control_all_x_embedder = nn.ModuleDict(
                {
                    f"{patch_size}-{f_patch_size}": nn.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True)
                    for patch_size, f_patch_size in zip(all_patch_size, all_f_patch_size)
                }
            )

            if self.is_two_stage_control:
                if self.add_control_noise_refiner:
                    self.control_noise_refiner = nn.ModuleList(
                        [
                            ZImageControlTransformerBlock(1000 + layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation=True, block_id=layer_id)
                            for layer_id in range(n_refiner_layers)
                        ]
                    )
                else:
                    self.control_noise_refiner = None
            else:  # V1
                self.control_noise_refiner = nn.ModuleList(
                    [ZImageTransformerBlock(1000 + i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation=True) for i in range(n_refiner_layers)]
                )
        else:
            self.control_layers = None
            self.control_all_x_embedder = None
            self.control_noise_refiner = None

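    # --- Illustrative sketch (editor's example, not from the original file) ---
    # A toy configuration with invented hyperparameters; note the assert above requires
    # dim // n_heads == sum(axes_dims), and axes_lens is left at its default so padded
    # caption/image positions stay within the precomputed RoPE tables:
    #   >>> toy = ZImageControlTransformer2DModel(
    #   ...     dim=64, n_layers=2, n_refiner_layers=1, n_heads=2, n_kv_heads=2,
    #   ...     cap_feat_dim=32, axes_dims=[16, 8, 8],
    #   ... )
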
    def _unpatchify(self, x_image_tokens: torch.Tensor, all_sizes: List[Tuple], patch_size: int, f_patch_size: int) -> torch.Tensor:
        """
        Converts a sequence of image tokens back into a batched image tensor. Each item's padding
        tokens are dropped based on its original (F, H, W) size; all images in a batch must still
        share the same size so the results can be stacked.

        Args:
            x_image_tokens (torch.Tensor): A tensor of image tokens with shape [B, SeqLen, Dim].
            all_sizes (List[Tuple]): A list of tuples with the original (F, H, W) size for each image in the batch.
            patch_size (int): The spatial patch size (height and width).
            f_patch_size (int): The frame/temporal patch size.

        Returns:
            torch.Tensor: The reconstructed latent tensor with shape [B, C, F, H, W].
        """
        pH = pW = patch_size
        pF = f_patch_size
        batch_size = x_image_tokens.shape[0]
        unpatched_images = []

        for i in range(batch_size):
            F, H, W = all_sizes[i]
            F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
            original_seq_len = F_tokens * H_tokens * W_tokens
            current_image_tokens = x_image_tokens[i, :original_seq_len, :]
            unpatched_image = current_image_tokens.view(F_tokens, H_tokens, W_tokens, pF, pH, pW, self.out_channels)
            unpatched_image = unpatched_image.permute(6, 0, 3, 1, 4, 2, 5).reshape(self.out_channels, F, H, W)
            unpatched_images.append(unpatched_image)

        try:
            final_tensor = torch.stack(unpatched_images, dim=0)
        except RuntimeError:
            raise ValueError(
                "Could not stack unpatched images into a single batch tensor. "
                "This typically occurs if you are trying to generate images of different sizes in the same batch."
            )

        return final_tensor

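    # --- Editor's note (worked example, not from the original file; values invented) ---
    # Patch arithmetic: a latent of shape (C=16, F=1, H=128, W=128) with patch_size=2 and
    # f_patch_size=1 becomes 1 * 64 * 64 = 4096 tokens of dimension 1 * 2 * 2 * 16 = 64.
    # _unpatchify inverts exactly this permute/reshape, discarding any tokens that were only
    # appended to reach a multiple of SEQ_MULTI_OF.
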
    def _patchify(
        self,
        all_image: List[torch.Tensor],
        patch_size: int,
        f_patch_size: int,
        cap_padding_len: int,
    ):
        """
        Converts a list of image tensors into patch sequences and computes their positional IDs.

        Args:
            all_image (List[torch.Tensor]): A list of image tensors to process.
            patch_size (int): The spatial patch size.
            f_patch_size (int): The frame/temporal patch size.
            cap_padding_len (int): The length of the padded caption sequence, used as an offset for image position IDs.

        Returns:
            Tuple: A tuple containing lists of processed patches, sizes, position IDs, and padding masks.
        """
        pH = pW = patch_size
        pF = f_patch_size
        device = all_image[0].device

        all_image_out = []
        all_image_size = []
        all_image_pos_ids = []
        all_image_pad_mask = []

        for i, image in enumerate(all_image):
            C, F, H, W = image.size()
            all_image_size.append((F, H, W))
            F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW

            image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
            image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)

            image_ori_len = len(image)
            image_padding_len = (-image_ori_len) % SEQ_MULTI_OF

            image_ori_pos_ids = self._create_coordinate_grid(
                size=(F_tokens, H_tokens, W_tokens),
                start=(cap_padding_len + 1, 0, 0),
                device=device,
            ).flatten(0, 2)
            image_padding_pos_ids = (
                self._create_coordinate_grid(
                    size=(1, 1, 1),
                    start=(0, 0, 0),
                    device=device,
                )
                .flatten(0, 2)
                .repeat(image_padding_len, 1)
            )
            image_padded_pos_ids = torch.cat([image_ori_pos_ids, image_padding_pos_ids], dim=0)
            all_image_pos_ids.append(image_padded_pos_ids)
            all_image_pad_mask.append(
                torch.cat(
                    [
                        torch.zeros((image_ori_len,), dtype=torch.bool, device=device),
                        torch.ones((image_padding_len,), dtype=torch.bool, device=device),
                    ],
                    dim=0,
                )
            )
            image_padded_feat = torch.cat([image, image[-1:].repeat(image_padding_len, 1)], dim=0)
            all_image_out.append(image_padded_feat)

        return (
            all_image_out,
            all_image_size,
            all_image_pos_ids,
            all_image_pad_mask,
        )

    def _patchify_and_embed(
        self,
        all_image: List[torch.Tensor],
        all_cap_feats: List[torch.Tensor],
        patch_size: int,
        f_patch_size: int,
    ):
        """
        Processes a batch of images and caption features by converting them into padded patch sequences
        and generating their corresponding positional IDs and padding masks. This is the general-purpose,
        robust version that iterates through the batch.

        Args:
            all_image (List[torch.Tensor]): A list of image tensors.
            all_cap_feats (List[torch.Tensor]): A list of caption feature tensors.
            patch_size (int): The spatial patch size.
            f_patch_size (int): The frame/temporal patch size.

        Returns:
            Tuple: A tuple containing all processed data structures (image patches, caption features, sizes,
            position IDs, and padding masks) as lists.
        """
        pH = pW = patch_size
        pF = f_patch_size
        device = all_image[0].device

        all_image_out, all_image_size, all_image_pos_ids, all_image_pad_mask = [], [], [], []
        all_cap_pos_ids, all_cap_pad_mask, all_cap_feats_out = [], [], []

        for i, (image, cap_feat) in enumerate(zip(all_image, all_cap_feats)):
            cap_ori_len = len(cap_feat)
            cap_padding_len = (-cap_ori_len) % SEQ_MULTI_OF
            cap_total_len = cap_ori_len + cap_padding_len

            cap_padded_pos_ids = self._create_coordinate_grid(size=(cap_total_len, 1, 1), start=(1, 0, 0), device=device).flatten(0, 2)
            all_cap_pos_ids.append(cap_padded_pos_ids)

            cap_mask = torch.ones(cap_total_len, dtype=torch.bool, device=device)
            cap_mask[:cap_ori_len] = False
            all_cap_pad_mask.append(cap_mask)

            if cap_padding_len > 0:
                padding_tensor = cap_feat[-1:].repeat(cap_padding_len, 1)
                cap_padded_feat = torch.cat([cap_feat, padding_tensor], dim=0)
            else:
                cap_padded_feat = cap_feat
            all_cap_feats_out.append(cap_padded_feat)

            C, Fr, H, W = image.size()
            all_image_size.append((Fr, H, W))
            F_tokens, H_tokens, W_tokens = Fr // pF, H // pH, W // pW

            image_reshaped = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW).permute(1, 3, 5, 2, 4, 6, 0).reshape(-1, pF * pH * pW * C)

            image_ori_len = image_reshaped.shape[0]
            image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
            image_total_len = image_ori_len + image_padding_len

            image_ori_pos_ids = self._create_coordinate_grid(size=(F_tokens, H_tokens, W_tokens), start=(cap_total_len + 1, 0, 0), device=device).flatten(0, 2)
            if image_padding_len > 0:
                image_padding_pos_ids = torch.zeros((image_padding_len, 3), dtype=torch.int32, device=device)
                image_padded_pos_ids = torch.cat([image_ori_pos_ids, image_padding_pos_ids], dim=0)
            else:
                image_padded_pos_ids = image_ori_pos_ids
            all_image_pos_ids.append(image_padded_pos_ids)

            image_mask = torch.ones(image_total_len, dtype=torch.bool, device=device)
            image_mask[:image_ori_len] = False
            all_image_pad_mask.append(image_mask)

            if image_padding_len > 0:
                padding_tensor = image_reshaped[-1:].repeat(image_padding_len, 1)
                image_padded_feat = torch.cat([image_reshaped, padding_tensor], dim=0)
            else:
                image_padded_feat = image_reshaped
            all_image_out.append(image_padded_feat)

        return (
            all_image_out,
            all_cap_feats_out,
            all_image_size,
            all_image_pos_ids,
            all_cap_pos_ids,
            all_image_pad_mask,
            all_cap_pad_mask,
        )

    def _process_cap_feats_with_cfg_cache(self, cap_feats_list, cap_pos_ids, cap_inner_pad_mask):
        """
        Processes caption features with intelligent duplicate detection to avoid redundant computation,
        especially for Classifier-Free Guidance (CFG) where prompts are repeated.

        Args:
            cap_feats_list (List[torch.Tensor]): List of padded caption feature tensors.
            cap_pos_ids (List[torch.Tensor]): List of corresponding position ID tensors.
            cap_inner_pad_mask (List[torch.Tensor]): List of corresponding padding masks.

        Returns:
            Tuple: A tuple of batched tensors for padded features, RoPE frequencies, attention mask, and sequence lengths.
        """
        device = cap_feats_list[0].device
        bsz = len(cap_feats_list)

        shapes_equal = all(c.shape == cap_feats_list[0].shape for c in cap_feats_list)

        if shapes_equal and bsz >= 2:
            unique_indices = [0]
            unique_tensors = [cap_feats_list[0]]
            tensor_mapping = [0]

            for i in range(1, bsz):
                found_match = False
                for j, unique_tensor in enumerate(unique_tensors):
                    if torch.equal(cap_feats_list[i], unique_tensor):
                        tensor_mapping.append(j)
                        found_match = True
                        break

                if not found_match:
                    unique_indices.append(i)
                    unique_tensors.append(cap_feats_list[i])
                    tensor_mapping.append(len(unique_tensors) - 1)

            if len(unique_tensors) < bsz:
                unique_cap_feats_list = [cap_feats_list[i] for i in unique_indices]
                unique_cap_pos_ids = [cap_pos_ids[i] for i in unique_indices]
                unique_cap_inner_pad_mask = [cap_inner_pad_mask[i] for i in unique_indices]

                cap_item_seqlens_unique = [len(i) for i in unique_cap_feats_list]
                cap_max_item_seqlen = max(cap_item_seqlens_unique)

                cap_feats_cat = torch.cat(unique_cap_feats_list, dim=0)
                cap_feats_embedded = self.cap_embedder(cap_feats_cat)
                cap_feats_embedded[torch.cat(unique_cap_inner_pad_mask)] = self.cap_pad_token
                cap_feats_padded_unique = pad_sequence(list(cap_feats_embedded.split(cap_item_seqlens_unique, dim=0)), batch_first=True, padding_value=0.0)

                cap_freqs_cis_cat = self.rope_embedder(torch.cat(unique_cap_pos_ids, dim=0))
                cap_freqs_cis_unique = pad_sequence(list(cap_freqs_cis_cat.split(cap_item_seqlens_unique, dim=0)), batch_first=True, padding_value=0.0)

                cap_feats_padded = cap_feats_padded_unique[tensor_mapping]
                cap_freqs_cis = cap_freqs_cis_unique[tensor_mapping]

                seq_lens_tensor = torch.tensor([cap_max_item_seqlen] * bsz, device=device, dtype=torch.int32)
                arange = torch.arange(cap_max_item_seqlen, device=device, dtype=torch.int32)
                cap_attn_mask = arange[None, :] < seq_lens_tensor[:, None]

                cap_item_seqlens = [cap_max_item_seqlen] * bsz

                return cap_feats_padded, cap_freqs_cis, cap_attn_mask, cap_item_seqlens

        cap_item_seqlens = [len(i) for i in cap_feats_list]
        cap_max_item_seqlen = max(cap_item_seqlens)
        cap_feats_cat = torch.cat(cap_feats_list, dim=0)
        cap_feats_embedded = self.cap_embedder(cap_feats_cat)
        cap_feats_embedded[torch.cat(cap_inner_pad_mask)] = self.cap_pad_token
        cap_feats_padded = pad_sequence(list(cap_feats_embedded.split(cap_item_seqlens, dim=0)), batch_first=True, padding_value=0.0)

        cap_freqs_cis_cat = self.rope_embedder(torch.cat(cap_pos_ids, dim=0))
        cap_freqs_cis = pad_sequence(list(cap_freqs_cis_cat.split(cap_item_seqlens, dim=0)), batch_first=True, padding_value=0.0)

        seq_lens_tensor = torch.tensor(cap_item_seqlens, device=device, dtype=torch.int32)
        arange = torch.arange(cap_max_item_seqlen, device=device, dtype=torch.int32)
        cap_attn_mask = arange[None, :] < seq_lens_tensor[:, None]

        return cap_feats_padded, cap_freqs_cis, cap_attn_mask, cap_item_seqlens

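    # --- Editor's note (hypothetical example, not from the original file) ---
    # Duplicate detection in practice: for a CFG batch whose caption features arrive as
    # [pos, neg, pos, neg], only two unique tensors are pushed through cap_embedder and the
    # RoPE embedder, and tensor_mapping == [0, 1, 0, 1] gathers the results back into batch
    # order, so the caption path cost does not grow with the number of repeated prompts.
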
    @staticmethod
    def _create_coordinate_grid(size, start=None, device=None):
        """
        Creates a 3D coordinate grid.

        Args:
            size (Tuple[int]): The dimensions of the grid (F, H, W).
            start (Tuple[int], optional): The starting coordinates for each axis. Defaults to (0, 0, 0).
            device (torch.device, optional): The device to create the tensor on. Defaults to None.

        Returns:
            torch.Tensor: The coordinate grid tensor.
        """
        if start is None:
            start = (0 for _ in size)
        axes = [torch.arange(x0, x0 + span, dtype=torch.int32, device=device) for x0, span in zip(start, size)]
        grids = torch.meshgrid(axes, indexing="ij")
        return torch.stack(grids, dim=-1)

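    # --- Illustrative sketch (editor's example, not from the original file) ---
    # A small worked example of the coordinate grid; the result has shape (1, 2, 2, 3) and
    # every entry carries its (frame, row, column) coordinate offset by `start`:
    #   >>> ZImageControlTransformer2DModel._create_coordinate_grid(size=(1, 2, 2), start=(5, 0, 0))
    #   tensor([[[[5, 0, 0],
    #             [5, 0, 1]],
    #            [[5, 1, 0],
    #             [5, 1, 1]]]], dtype=torch.int32)
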
    def _apply_transformer_blocks(self, hidden_states, layers, checkpoint_ratio=0.5, **kwargs):
        """
        Applies a list of transformer layers to the hidden states, with optional selective gradient checkpointing.

        Args:
            hidden_states (torch.Tensor): The input tensor.
            layers (nn.ModuleList): The list of transformer layers to apply.
            checkpoint_ratio (float, optional): The ratio of layers to apply gradient checkpointing to. Defaults to 0.5.
            **kwargs: Additional keyword arguments to pass to each layer's forward method.

        Returns:
            torch.Tensor: The output tensor after applying all layers.
        """
        if torch.is_grad_enabled() and self.gradient_checkpointing:

            def create_custom_forward(module, **static_kwargs):
                def custom_forward(*inputs):
                    return module(*inputs, **static_kwargs)

                return custom_forward

            ckpt_kwargs = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}

            checkpoint_every_n = max(1, int(1.0 / checkpoint_ratio)) if checkpoint_ratio > 0 else len(layers) + 1

            for i, layer in enumerate(layers):
                if i % checkpoint_every_n == 0:
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(layer, **kwargs),
                        hidden_states,
                        **ckpt_kwargs,
                    )
                else:
                    hidden_states = layer(hidden_states, **kwargs)
        else:
            for layer in layers:
                hidden_states = layer(hidden_states, **kwargs)

        return hidden_states

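    # --- Editor's note (worked arithmetic, not from the original file) ---
    # checkpoint_every_n = max(1, int(1.0 / checkpoint_ratio)): with the default ratio of 0.5
    # every second layer (indices 0, 2, 4, ...) is recomputed during the backward pass, and a
    # ratio of 1.0 checkpoints every layer. When gradients are disabled, or gradient
    # checkpointing is off, the layers are simply applied in sequence.
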
    def _prepare_control_inputs(self, control_context, cap_feats_ref, t, patch_size, f_patch_size, device):
        """
        Prepares the control context for the transformer, including patchifying, embedding, and generating
        positional information. Includes a fast path for batches with uniform shapes.

        Args:
            control_context (torch.Tensor or List[torch.Tensor]): The control context input.
            cap_feats_ref (List[torch.Tensor]): A reference to caption features for padding calculation.
            t (torch.Tensor): The timestep tensor.
            patch_size (int): The spatial patch size.
            f_patch_size (int): The frame/temporal patch size.
            device (torch.device): The target device.

        Returns:
            Dict: A dictionary containing the processed control tensors ('c', 'c_item_seqlens', 'attn_mask', etc.).
        """
        bsz = control_context.shape[0]

        if isinstance(control_context, torch.Tensor) and control_context.ndim == 5:
            control_list = list(torch.unbind(control_context, dim=0))
        else:
            control_list = control_context

        pH = pW = patch_size
        pF = f_patch_size
        cap_padding_len = cap_feats_ref[0].size(0) if isinstance(cap_feats_ref, list) else cap_feats_ref.shape[1]

        shapes = [c.shape for c in control_list]
        same_shape = all(s == shapes[0] for s in shapes)

        if same_shape and bsz >= 2:
            control_batch = torch.stack(control_list, dim=0)
            B, C, F, H, W = control_batch.shape
            F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW

            control_batch = control_batch.view(B, C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
            control_batch = control_batch.permute(0, 2, 4, 6, 3, 5, 7, 1).reshape(B, F_tokens * H_tokens * W_tokens, pF * pH * pW * C)

            ori_len = control_batch.shape[1]
            padding_len = (-ori_len) % SEQ_MULTI_OF

            if padding_len > 0:
                pad_tensor = control_batch[:, -1:, :].repeat(1, padding_len, 1)
                control_batch = torch.cat([control_batch, pad_tensor], dim=1)

            c = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_batch)

            final_seq_len = control_batch.shape[1]
            pos_ids_ori = self._create_coordinate_grid(
                size=(F_tokens, H_tokens, W_tokens),
                start=(cap_padding_len + 1, 0, 0),
                device=device,
            ).flatten(0, 2)  # [ori_len, 3]

            pos_ids_pad = torch.zeros((padding_len, 3), dtype=torch.int32, device=device)
            pos_ids_padded = torch.cat([pos_ids_ori, pos_ids_pad], dim=0)

            c_freqs_cis_single = self.rope_embedder(pos_ids_padded)
            c_freqs_cis = c_freqs_cis_single.unsqueeze(0).repeat(B, 1, 1, 1)
            c_attn_mask = torch.ones((B, final_seq_len), dtype=torch.bool, device=device)

            return {"c": c, "c_item_seqlens": [final_seq_len] * B, "attn_mask": c_attn_mask, "freqs_cis": c_freqs_cis, "adaln_input": t.type_as(c)}

        (c_patches, _, c_pos_ids, c_inner_pad_mask) = self._patchify(control_list, patch_size, f_patch_size, cap_padding_len)

        c_item_seqlens = [len(p) for p in c_patches]
        c_max_item_seqlen = max(c_item_seqlens)

        c = torch.cat(c_patches, dim=0)
        c = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](c)
        c[torch.cat(c_inner_pad_mask)] = self.x_pad_token
        c = list(c.split(c_item_seqlens, dim=0))

        c_freqs_cis_list = []
        for pos_ids in c_pos_ids:
            c_freqs_cis_list.append(self.rope_embedder(pos_ids))

        c_padded = pad_sequence(c, batch_first=True, padding_value=0.0)
        c_freqs_cis_padded = pad_sequence(c_freqs_cis_list, batch_first=True, padding_value=0.0)

        seq_lens_tensor = torch.tensor(c_item_seqlens, device=device, dtype=torch.int32)
        arange = torch.arange(c_max_item_seqlen, device=device, dtype=torch.int32)
        c_attn_mask = arange[None, :] < seq_lens_tensor[:, None]

        return {"c": c_padded, "c_item_seqlens": c_item_seqlens, "attn_mask": c_attn_mask, "freqs_cis": c_freqs_cis_padded, "adaln_input": t.type_as(c_padded)}

    def _patchify_and_embed_batch_optimized(self, all_image, all_cap_feats, patch_size, f_patch_size):
        """
        An optimized version of _patchify_and_embed for batches where all images and captions have
        uniform shapes. It processes the entire batch using vectorized operations instead of a loop.

        Args:
            all_image (List[torch.Tensor]): List of image tensors, all of the same shape.
            all_cap_feats (List[torch.Tensor]): List of caption features, all of the same shape.
            patch_size (int): The spatial patch size.
            f_patch_size (int): The frame/temporal patch size.

        Returns:
            Tuple: A tuple containing all processed data structures, matching the output of the standard method.
        """
        pH = pW = patch_size
        pF = f_patch_size
        device = all_image[0].device

        image_shapes = [img.shape for img in all_image]
        cap_shapes = [cap.shape for cap in all_cap_feats]

        same_image_shape = all(s == image_shapes[0] for s in image_shapes)
        same_cap_shape = all(s == cap_shapes[0] for s in cap_shapes)

        if not (same_image_shape and same_cap_shape):
            return self._patchify_and_embed(all_image, all_cap_feats, patch_size, f_patch_size)

        images_batch = torch.stack(all_image, dim=0)
        caps_batch = torch.stack(all_cap_feats, dim=0)

        B, C, Fr, H, W = images_batch.shape
        cap_ori_len = caps_batch.shape[1]

        cap_padding_len = (-cap_ori_len) % SEQ_MULTI_OF
        cap_total_len = cap_ori_len + cap_padding_len

        if cap_padding_len > 0:
            cap_pad = caps_batch[:, -1:, :].repeat(1, cap_padding_len, 1)
            caps_batch = torch.cat([caps_batch, cap_pad], dim=1)

        cap_pos_ids = self._create_coordinate_grid(size=(cap_total_len, 1, 1), start=(1, 0, 0), device=device).flatten(0, 2).unsqueeze(0).repeat(B, 1, 1)

        cap_mask = torch.zeros((B, cap_total_len), dtype=torch.bool, device=device)
        if cap_padding_len > 0:
            cap_mask[:, cap_ori_len:] = True

        F_tokens, H_tokens, W_tokens = Fr // pF, H // pH, W // pW
        images_reshaped = (
            images_batch.view(B, C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
            .permute(0, 2, 4, 6, 3, 5, 7, 1)
            .reshape(B, F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
        )

        image_ori_len = images_reshaped.shape[1]
        image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
        image_total_len = image_ori_len + image_padding_len

        if image_padding_len > 0:
            img_pad = images_reshaped[:, -1:, :].repeat(1, image_padding_len, 1)
            images_reshaped = torch.cat([images_reshaped, img_pad], dim=1)

        image_pos_ids = (
            self._create_coordinate_grid(size=(F_tokens, H_tokens, W_tokens), start=(cap_total_len + 1, 0, 0), device=device)
            .flatten(0, 2)
            .unsqueeze(0)
            .repeat(B, 1, 1)
        )

        if image_padding_len > 0:
            img_pos_pad = torch.zeros((B, image_padding_len, 3), dtype=torch.int32, device=device)
            image_pos_ids = torch.cat([image_pos_ids, img_pos_pad], dim=1)

        image_mask = torch.zeros((B, image_total_len), dtype=torch.bool, device=device)
        if image_padding_len > 0:
            image_mask[:, image_ori_len:] = True

        all_image_size = [(Fr, H, W)] * B

        return (
            list(torch.unbind(images_reshaped, dim=0)),
            list(torch.unbind(caps_batch, dim=0)),
            all_image_size,
            list(torch.unbind(image_pos_ids, dim=0)),
            list(torch.unbind(cap_pos_ids, dim=0)),
            list(torch.unbind(image_mask, dim=0)),
            list(torch.unbind(cap_mask, dim=0)),
        )

def forward(
|
| 1271 |
+
self,
|
| 1272 |
+
x: List[torch.Tensor],
|
| 1273 |
+
t,
|
| 1274 |
+
cap_feats: List[torch.Tensor],
|
| 1275 |
+
patch_size=2,
|
| 1276 |
+
f_patch_size=1,
|
| 1277 |
+
control_context=None,
|
| 1278 |
+
conditioning_scale=1.0,
|
| 1279 |
+
refiner_conditioning_scale=1.0,
|
| 1280 |
+
):
|
| 1281 |
+
"""
|
| 1282 |
+
The main forward pass of the transformer model.
|
| 1283 |
+
|
| 1284 |
+
Args:
|
| 1285 |
+
x (List[torch.Tensor]):
|
| 1286 |
+
A list of latent image tensors.
|
| 1287 |
+
t (torch.Tensor):
|
| 1288 |
+
A batch of timesteps.
|
| 1289 |
+
cap_feats (List[torch.Tensor]):
|
| 1290 |
+
A list of caption feature tensors.
|
| 1291 |
+
patch_size (int, optional):
|
| 1292 |
+
The spatial patch size to use. Defaults to 2.
|
| 1293 |
+
f_patch_size (int, optional):
|
| 1294 |
+
The frame/temporal patch size to use. Defaults to 1.
|
| 1295 |
+
control_context (torch.Tensor, optional):
|
| 1296 |
+
The control context tensor. Defaults to None.
|
| 1297 |
+
conditioning_scale (float, optional):
|
| 1298 |
+
The scale for applying control hints. Defaults to 1.0.
|
| 1299 |
+
refiner_conditioning_scale (float, optional):
|
| 1300 |
+
The scale for applying refiner control hints. Defaults to 1.0.
|
| 1301 |
+
|
| 1302 |
+
Returns:
|
| 1303 |
+
Transformer2DModelOutput: An object containing the final denoised sample.
|
| 1304 |
+
"""
|
| 1305 |
+
|
| 1306 |
+
is_control_mode = self.use_controlnet and control_context is not None and conditioning_scale > 0
|
| 1307 |
+
if refiner_conditioning_scale is None:
|
| 1308 |
+
refiner_conditioning_scale = conditioning_scale or 1.0
|
| 1309 |
+
|
| 1310 |
+
assert patch_size in self.all_patch_size
|
| 1311 |
+
assert f_patch_size in self.all_f_patch_size
|
| 1312 |
+
|
| 1313 |
+
bsz = len(x)
|
| 1314 |
+
device = x[0].device
|
| 1315 |
+
|
| 1316 |
+
t = t * self.t_scale
|
| 1317 |
+
t = self.t_embedder(t)
|
| 1318 |
+
|
| 1319 |
+
can_optimize_patchify = (
|
| 1320 |
+
bsz == len(cap_feats) and bsz >= 2 and all(img.shape == x[0].shape for img in x) and all(cap.shape == cap_feats[0].shape for cap in cap_feats)
|
| 1321 |
+
)
|
| 1322 |
+
|
| 1323 |
+
if can_optimize_patchify:
|
| 1324 |
+
(x_list, cap_feats_list, x_size, x_pos_ids, cap_pos_ids, x_inner_pad_mask, cap_inner_pad_mask) = self._patchify_and_embed_batch_optimized(
|
| 1325 |
+
x, cap_feats, patch_size, f_patch_size
|
| 1326 |
+
)
|
| 1327 |
+
else:
|
| 1328 |
+
(x_list, cap_feats_list, x_size, x_pos_ids, cap_pos_ids, x_inner_pad_mask, cap_inner_pad_mask) = self._patchify_and_embed(
|
| 1329 |
+
x, cap_feats, patch_size, f_patch_size
|
| 1330 |
+
)
|
| 1331 |
+
|
| 1332 |
+
x_item_seqlens = [len(i) for i in x_list]
|
| 1333 |
+
x_max_item_seqlen = max(x_item_seqlens) if x_item_seqlens else 0
|
| 1334 |
+
x_cat = torch.cat(x_list, dim=0) if x_list else torch.empty(0, x_list[0].shape[1] if x_list else 0, device=device)
|
| 1335 |
+
x_embedded = self.all_x_embedder[f"{patch_size}-{f_patch_size}"](x_cat)
|
| 1336 |
+
if x_inner_pad_mask and torch.cat(x_inner_pad_mask).any():
|
| 1337 |
+
x_embedded[torch.cat(x_inner_pad_mask)] = self.x_pad_token
|
| 1338 |
+
x = pad_sequence(list(x_embedded.split(x_item_seqlens, dim=0)), batch_first=True, padding_value=0.0)
|
| 1339 |
+
adaln_input = t.to(device).type_as(x)
|
| 1340 |
+
|
| 1341 |
+
cap_feats_padded, cap_freqs_cis, cap_attn_mask, cap_item_seqlens = self._process_cap_feats_with_cfg_cache(
|
| 1342 |
+
cap_feats_list, cap_pos_ids, cap_inner_pad_mask
|
| 1343 |
+
)
|
| 1344 |
+
|
| 1345 |
+
x_freqs_cis_cat = self.rope_embedder(torch.cat(x_pos_ids, dim=0)) if x_pos_ids else torch.empty(0, device=device)
|
| 1346 |
+
x_freqs_cis = pad_sequence(list(x_freqs_cis_cat.split(x_item_seqlens, dim=0)), batch_first=True, padding_value=0.0)
|
| 1347 |
+
|
| 1348 |
+
seq_lens_tensor = torch.tensor(x_item_seqlens, device=device, dtype=torch.int32)
|
| 1349 |
+
arange = torch.arange(x_max_item_seqlen, device=device, dtype=torch.int32)
|
| 1350 |
+
x_attn_mask = arange[None, :] < seq_lens_tensor[:, None]
|
| 1351 |
+
|
| 1352 |
+
refiner_hints = None
|
| 1353 |
+
if is_control_mode and self.is_two_stage_control:
|
| 1354 |
+
prepared_control = self._prepare_control_inputs(control_context, cap_feats_padded, t, patch_size, f_patch_size, device)
|
| 1355 |
+
c = prepared_control["c"]
|
| 1356 |
+
kwargs_for_control_refiner = {
|
| 1357 |
+
"x": x,
|
| 1358 |
+
"attn_mask": prepared_control["attn_mask"],
|
| 1359 |
+
"freqs_cis": prepared_control["freqs_cis"],
|
| 1360 |
+
"adaln_input": prepared_control["adaln_input"],
|
| 1361 |
+
}
|
| 1362 |
+
c_processed = self._apply_transformer_blocks(
|
| 1363 |
+
c,
|
| 1364 |
+
self.control_noise_refiner if self.add_control_noise_refiner else self.control_layers,
|
| 1365 |
+
checkpoint_ratio=self.checkpoint_ratio,
|
| 1366 |
+
**kwargs_for_control_refiner,
|
| 1367 |
+
)
|
| 1368 |
+
refiner_hints = torch.unbind(c_processed)[:-1]
|
| 1369 |
+
control_context_processed = torch.unbind(c_processed)[-1]
|
| 1370 |
+
            control_context_item_seqlens = prepared_control["c_item_seqlens"]

        # Refine the noisy image tokens (with optional control hints for the refiner stage).
        kwargs_for_refiner = {
            "attn_mask": x_attn_mask,
            "freqs_cis": x_freqs_cis,
            "adaln_input": adaln_input,
            "context_scale": refiner_conditioning_scale,
        }
        if refiner_hints is not None:
            kwargs_for_refiner["hints"] = refiner_hints
        x = self._apply_transformer_blocks(x, self.noise_refiner, checkpoint_ratio=1.0, **kwargs_for_refiner)

        # Refine the caption/context tokens.
        kwargs_for_context = {"attn_mask": cap_attn_mask, "freqs_cis": cap_freqs_cis}
        cap_feats = self._apply_transformer_blocks(cap_feats_padded, self.context_refiner, checkpoint_ratio=1.0, **kwargs_for_context)

        # Pack image tokens and caption tokens into one padded, unified sequence per sample.
        unified_item_seqlens = [a + b for a, b in zip(x_item_seqlens, cap_item_seqlens)]
        unified_max_item_seqlen = max(unified_item_seqlens) if unified_item_seqlens else 0
        unified = torch.zeros((bsz, unified_max_item_seqlen, x.shape[-1]), dtype=x.dtype, device=device)
        unified_freqs_cis = torch.zeros((bsz, unified_max_item_seqlen, x_freqs_cis.shape[-2], x_freqs_cis.shape[-1]), dtype=x_freqs_cis.dtype, device=device)

        for i in range(bsz):
            x_len = x_item_seqlens[i]
            cap_len = cap_item_seqlens[i]
            unified[i, :x_len] = x[i, :x_len]
            unified[i, x_len : x_len + cap_len] = cap_feats[i, :cap_len]
            unified_freqs_cis[i, :x_len] = x_freqs_cis[i, :x_len]
            unified_freqs_cis[i, x_len : x_len + cap_len] = cap_freqs_cis[i, :cap_len]

        # Boolean attention mask marking the valid (non-padded) positions of each unified sequence.
        seq_lens_tensor = torch.tensor(unified_item_seqlens, device=device, dtype=torch.int32)
        arange = torch.arange(unified_max_item_seqlen, device=device, dtype=torch.int32)
        unified_attn_mask = arange[None, :] < seq_lens_tensor[:, None]

        # Build control hints by running the control branch (two-stage or v1 refiner variant).
        hints = None
        if is_control_mode:
            kwargs_for_hints = {
                "attn_mask": unified_attn_mask,
                "freqs_cis": unified_freqs_cis,
                "adaln_input": adaln_input,
            }
            if self.is_two_stage_control:
                control_context_unified_list = [
                    torch.cat([control_context_processed[i][: control_context_item_seqlens[i]], cap_feats[i, : cap_item_seqlens[i]]], dim=0) for i in range(bsz)
                ]
                c = pad_sequence(control_context_unified_list, batch_first=True, padding_value=0.0)
                new_kwargs = dict(x=unified, **kwargs_for_hints)
                c_processed = self._apply_transformer_blocks(c, self.control_layers, checkpoint_ratio=self.checkpoint_ratio, **new_kwargs)
                hints = torch.unbind(c_processed)[:-1]
            else:
                prepared_control = self._prepare_control_inputs(control_context, cap_feats_padded, t, patch_size, f_patch_size, device)
                c = prepared_control["c"]
                kwargs_for_v1_refiner = {
                    "attn_mask": prepared_control["attn_mask"],
                    "freqs_cis": prepared_control["freqs_cis"],
                    "adaln_input": prepared_control["adaln_input"],
                }
                c = self._apply_transformer_blocks(c, self.control_noise_refiner, checkpoint_ratio=self.checkpoint_ratio, **kwargs_for_v1_refiner)
                c_item_seqlens = prepared_control["c_item_seqlens"]
                control_context_unified_list = [torch.cat([c[i, : c_item_seqlens[i]], cap_feats[i, : cap_item_seqlens[i]]], dim=0) for i in range(bsz)]
                c_unified = pad_sequence(control_context_unified_list, batch_first=True, padding_value=0.0)
                new_kwargs = dict(x=unified, **kwargs_for_hints)
                c_processed = self._apply_transformer_blocks(c_unified, self.control_layers, checkpoint_ratio=self.checkpoint_ratio, **new_kwargs)
                hints = torch.unbind(c_processed)[:-1]

        # Main transformer stack, consuming the control hints when present.
        kwargs_for_layers = {"attn_mask": unified_attn_mask, "freqs_cis": unified_freqs_cis, "adaln_input": adaln_input}
        if hints is not None:
            kwargs_for_layers["hints"] = hints
            kwargs_for_layers["context_scale"] = conditioning_scale
        unified = self._apply_transformer_blocks(unified, self.layers, checkpoint_ratio=self.checkpoint_ratio, **kwargs_for_layers)

        # Project back to patch space and unpatchify the image tokens into the output latent.
        unified_out = self.all_final_layer[f"{patch_size}-{f_patch_size}"](unified, adaln_input)
        x_image_tokens = unified_out[:, :x_max_item_seqlen]
        x_final_tensor = self._unpatchify(x_image_tokens, x_size, patch_size, f_patch_size)

        return Transformer2DModelOutput(sample=x_final_tensor)
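For orientation, the packing step above concatenates each sample's image tokens and caption tokens into one variable-length sequence, zero-pads to the batch maximum, and builds a boolean mask over the valid positions. A minimal, self-contained sketch of that pattern with toy tensors (shapes and the feature dim of 8 are made up for illustration, not the model's real sizes):

import torch
from torch.nn.utils.rnn import pad_sequence

# Two toy samples: image tokens of length 5 and 3, caption tokens of length 2 and 3.
x_tokens = [torch.randn(5, 8), torch.randn(3, 8)]
cap_tokens = [torch.randn(2, 8), torch.randn(3, 8)]

# Per sample: image tokens first, caption tokens after, then zero-pad to the batch maximum.
unified = pad_sequence(
    [torch.cat([x, c], dim=0) for x, c in zip(x_tokens, cap_tokens)],
    batch_first=True,
    padding_value=0.0,
)

# Boolean attention mask: True for real tokens, False for padding.
seq_lens = torch.tensor([x.shape[0] + c.shape[0] for x, c in zip(x_tokens, cap_tokens)])
attn_mask = torch.arange(unified.shape[1])[None, :] < seq_lens[:, None]

print(unified.shape)          # torch.Size([2, 7, 8])
print(attn_mask.sum(dim=1))   # tensor([7, 6])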
infer_controlnet.py
ADDED
@@ -0,0 +1,146 @@
import os
import time
from pathlib import Path

import torch
from diffusers import FlowMatchEulerDiscreteScheduler, GGUFQuantizationConfig
from diffusers.utils import load_image
from diffusers_local import patch  # Apply necessary patches for local diffusers components

# 1. Import all necessary components
from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline
from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel


os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,garbage_collection_threshold:0.7,max_split_size_mb:1024"


def main():
    # 2. Set params
    BASE_MODEL_ID = "."
    GGUF_MODEL_FILE = "./transformer/z_image_turbo_control_unified_v2.1_q4_k_m.gguf"

    use_gguf = True

    control_mode = "depth"  # (pose, canny, depth, hed, mlsd)
    negative_prompt = "low quality, blurry, ugly, deformed fingers, extra fingers, bad hand, bad anatomy, noise, overexposed, underexposed"
    guidance_scale = 0
    seed = 43
    shift = 3.0
    controlnet_conditioning_refiner_scale = None

    if control_mode == "pose":
        # prompt = "一位年轻女子站在阳光明媚的海岸线上,白裙在轻拂的海风中微微飘动,裙摆轻盈飞扬。她拥有一头鲜艳的紫色长发,在风中轻盈舞动,发间系着一个精致的黑色蝴蝶结,与身后柔和的蔚蓝天空形成鲜明对比。她面容清秀,眉目精致,肤色白皙细腻,透着一股甜美的青春气息;神情柔和,略带羞涩,目光静静地凝望着远方的地平线,双手自然交叠于身前,手指清晰可见、五指完整、指节自然、姿势优雅放松,仿佛沉浸在思绪之中。背景是辽阔无垠、波光粼粼的大海,阳光洒在海面上,映出温暖的金色光晕,海浪轻轻拍打沙滩,天空湛蓝云朵稀薄。整体画面高清锐利、细节丰富、色彩鲜艳、焦点清晰、8K分辨率、杰作、最佳质量、无模糊、无噪点、无畸变、自然光照、电影级渲染。"
        prompt = "Photorealistic portrait of a beautiful young East Asian woman with long, vibrant purple hair and a black bow. She is wearing a flowing white summer dress, standing on a sunny beach with a sparkling ocean and clear blue sky in the background. Bright natural sunlight, sharp focus, ultra-detailed."
        control_image = load_image("assets/pose.jpg")
        target_height = 1728
        target_width = 992
        num_inference_steps = 25
        controlnet_conditioning_scale = 0.75
        controlnet_conditioning_refiner_scale = 1.0
        # guidance_scale = 2.5
    elif control_mode == "canny":
        # prompt = "A jaguar in the forest, soft cinematic lighting, balanced exposure, 8K UHD, DSLR camera, sharp focus, realistic texture."
        prompt = "A masterpiece photograph, the face of a stunning leopard in a moment of calm intensity. It is peeking from a hideaway of dark green ivy leaves and tiny jasmine flowers. Its amber-colored eyes are the focal point, locked onto the viewer with a piercing intelligence. The light is cinematic, soft and directional, sculpting the animal's features and highlighting the velvety texture of its fur and the wet gleam of its nose. Intimate and silent atmosphere. 4K quality, ultra-realistic."
        control_image = load_image("assets/canny.jpg")
        target_height = 1328
        target_width = 880
        num_inference_steps = 25
        controlnet_conditioning_scale = 1.0
    elif control_mode == "depth":
        prompt = "Photorealistic portrait of a fluffy long-haired cat, sitting in a forest at night. The cat is in the foreground, close-up, and in sharp focus. The background with trees is heavily blurred, creating a strong bokeh effect. Soft lighting from the front illuminates the cat."
        control_image = load_image("assets/depth_cat.png")
        target_height = 1024
        target_width = 1024
        num_inference_steps = 15
        controlnet_conditioning_scale = 0.7
        guidance_scale = 1.5
    elif control_mode == "hed":
        # prompt = "raw photo, portrait of a handsome Asian man sitting at a wooden table, holding a green glass bottle, wearing a black sweater, wristwatch, highly detailed skin texture, realistic pores, serious gaze, soft cinematic lighting, rim lighting, balanced exposure, 8k uhd, dslr, sharp focus, wood grain texture."
        prompt = "Cinematic film still, an ultra-realistic portrait of a melancholic Korean man in a dimly lit room. He is sitting at a dark wooden table, his hands wrapped around a green soju bottle. Rembrandt-style lighting, with a soft key light from the side sculpting his features and casting the other side in deep shadow (chiaroscuro). Sharp focus on his weary, expressive eyes. Shallow depth of field, with the dark background blurred out. Subtle film grain, art-house cinema aesthetic."
        negative_prompt = "underexposed, crushed blacks, too dark, heavy shadows, makeup, smooth skin, plastic, wax, cartoon, illustration, distorted hands, bad anatomy, blur, haze, flat lighting"
        control_image = load_image("assets/man_hed.png")
        target_height = 1024
        target_width = 768
        num_inference_steps = 25
        controlnet_conditioning_scale = 0.7
        guidance_scale = 2.5
    elif control_mode == "mlsd":
        prompt = "RAW photo, professional interior design photography of a bright and clean contemporary home office. A sleek white desk with distressed wood grain drawers and chrome handles sits before a large window. A modern white ergonomic chair. To the left, a tall built-in white bookshelf with warm, backlit shelves. A dark wood accent wall. Cozy beige chaise lounge with a decorative red pillow."
        control_image = load_image("assets/room_mlsd.png")
        target_height = 1024
        target_width = 1024
        num_inference_steps = 25
        controlnet_conditioning_scale = 0.85
        controlnet_conditioning_refiner_scale = 1.0

    generator = torch.Generator("cuda").manual_seed(seed)

    print("Loading Pipeline...")
    scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=shift)

    if use_gguf:
        transformer = ZImageControlTransformer2DModel.from_single_file(
            GGUF_MODEL_FILE,
            torch_dtype=torch.bfloat16,
            config=str(Path(GGUF_MODEL_FILE).parent),
            quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
            add_control_noise_refiner=True,  # <== Set to False if you don't want to use the control noise refiner.
        )
    else:
        transformer = ZImageControlTransformer2DModel.from_pretrained(
            BASE_MODEL_ID,
            subfolder="transformer",
            torch_dtype=torch.bfloat16,
            add_control_noise_refiner=True,  # <== Set to False if you don't want to use the control noise refiner.
        )

    pipe = ZImageControlUnifiedPipeline.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.bfloat16,
        transformer=transformer,  # For the diffusers checkpoint you can omit this unless you want to change add_control_noise_refiner; for GGUF the transformer must be loaded separately.
    )
    pipe.scheduler = scheduler

    # Apply optimization (Optional)
    pipe.enable_group_offload(
        onload_device="cuda",
        offload_device="cpu",
        offload_type="block_level",
        num_blocks_per_group=1,
        low_cpu_mem_usage=True,
        use_stream=True,
    )
    pipe.vae.use_tiling = True
    # ---

    print("\nRunning Inference...")
    start_inference_time = time.time()

    generated_image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        control_image=control_image,
        height=target_height,
        width=target_width,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        controlnet_refiner_conditioning_scale=controlnet_conditioning_refiner_scale,
        generator=generator,
    ).images[0]

    end_inference_time = time.time()
    print(f"\nGeneration finished in {end_inference_time - start_inference_time:.2f} seconds.")

    # Save Output
    if not os.path.exists("outputs"):
        os.makedirs("outputs")
    output_filename = f"outputs/z_image_controlnet_result_control_{control_mode}.png"
    generated_image.save(output_filename)
    print(f"Image successfully saved as '{output_filename}'")
    generated_image.show()


if __name__ == "__main__":
    main()
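The script above loads ready-made control maps from assets/. If you want to build your own Canny control image from a photo, a minimal sketch using opencv-python (already pinned in requirements.txt) follows; the input path "my_photo.jpg", the thresholds 100/200, and the 3-channel replication are illustrative assumptions, not requirements of this pipeline.

import cv2
import numpy as np
from PIL import Image

# Load any RGB photo (hypothetical path), convert to grayscale, and extract Canny edges.
src = np.array(Image.open("my_photo.jpg").convert("RGB"))
gray = cv2.cvtColor(src, cv2.COLOR_RGB2GRAY)
edges = cv2.Canny(gray, 100, 200)  # thresholds are arbitrary starting points

# Replicate the single-channel edge map to 3 channels and save it as the control image,
# then pass it to the pipeline via load_image(...) as in the script above.
Image.fromarray(np.stack([edges] * 3, axis=-1)).save("my_canny_control.png")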
infer_i2i.py
ADDED
@@ -0,0 +1,94 @@
import os
import time
from pathlib import Path

import torch
from diffusers import FlowMatchEulerDiscreteScheduler, GGUFQuantizationConfig
from diffusers.utils import load_image
from diffusers_local import patch  # Apply necessary patches for local diffusers components

# 1. Import all necessary components
from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline
from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel


os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,garbage_collection_threshold:0.7,max_split_size_mb:1024"


def main():
    # 2. Set params
    BASE_MODEL_ID = "."
    GGUF_MODEL_FILE = "./transformer/z_image_turbo_control_unified_v2.1_q4_k_m.gguf"

    use_gguf = True

    prompt = "an Asian man with a bottle"
    negative_prompt = "Low quality, blurry, ugly, deformed fingers, extra fingers, bad hand, bad anatomy, noise, overexposed, underexposed"

    target_height = 1024
    target_width = 768
    num_inference_steps = 9
    guidance_scale = 0
    strength = 0.75
    seed = 43
    shift = 3.0
    input_image = load_image("assets/bottle.jpg")
    generator = torch.Generator("cuda").manual_seed(seed)

    print("Loading Pipeline...")
    scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=shift)

    if use_gguf:
        transformer = ZImageControlTransformer2DModel.from_single_file(
            GGUF_MODEL_FILE,
            torch_dtype=torch.bfloat16,
            config=str(Path(GGUF_MODEL_FILE).parent),
            quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
            use_controlnet=False,  # <== Disable the control layers for faster inference.
        )
    else:
        transformer = ZImageControlTransformer2DModel.from_pretrained(
            BASE_MODEL_ID,
            subfolder="transformer",
            torch_dtype=torch.bfloat16,
            use_controlnet=False,  # <== Disable the control layers for faster inference.
        )
    pipe = ZImageControlUnifiedPipeline.from_pretrained(BASE_MODEL_ID, torch_dtype=torch.bfloat16, transformer=transformer)
    pipe.scheduler = scheduler

    # Apply optimization (Optional)
    pipe.enable_group_offload(
        onload_device="cuda", offload_device="cpu", offload_type="block_level", num_blocks_per_group=1, low_cpu_mem_usage=True, use_stream=True
    )
    pipe.vae.use_tiling = True
    # ---

    print("\nRunning Inference...")
    start_inference_time = time.time()

    generated_image = pipe(
        prompt=prompt,
        image=input_image,
        strength=strength,
        negative_prompt=negative_prompt,
        height=target_height,
        width=target_width,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        generator=generator,
    ).images[0]

    end_inference_time = time.time()
    print(f"\nGeneration finished in {end_inference_time - start_inference_time:.2f} seconds.")

    # Save Output
    if not os.path.exists("outputs"):
        os.makedirs("outputs")
    output_filename = "outputs/z_image_controlnet_result_i2i.png"
    generated_image.save(output_filename)
    print(f"Image successfully saved as '{output_filename}'")
    generated_image.show()


if __name__ == "__main__":
    main()
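A note on strength=0.75: assuming this pipeline follows the usual diffusers img2img convention (not verified against the local pipeline code), strength decides how many of the scheduled steps are actually run on the noised input image. A small sketch of that arithmetic for the values used above:

# Hedged sketch: typical diffusers-style mapping from `strength` to the steps actually run.
num_inference_steps = 9
strength = 0.75
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 6
t_start = max(num_inference_steps - init_timestep, 0)                          # 3
print(f"steps actually run: {num_inference_steps - t_start}")                   # 6

Lower strength keeps more of the input image; strength=1.0 would re-noise it almost completely.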
infer_inpaint.py
ADDED
@@ -0,0 +1,109 @@
import os
import time
from pathlib import Path

import torch
from diffusers import FlowMatchEulerDiscreteScheduler, GGUFQuantizationConfig
from diffusers.utils import load_image
from diffusers_local import patch  # Apply necessary patches for local diffusers components

# 1. Import all necessary components
from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline
from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel


os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,garbage_collection_threshold:0.7,max_split_size_mb:1024"


def main():
    # 2. Set params
    BASE_MODEL_ID = "."
    GGUF_MODEL_FILE = "./transformer/z_image_turbo_control_unified_v2.1_q4_k_m.gguf"

    use_gguf = True

    # prompt = "一位年轻女子站在阳光明媚的海岸线上,白裙在轻拂的海风中微微飘动,裙摆轻盈飞扬。她拥有一头鲜艳的紫色长发,在风中轻盈舞动,发间系着一个精致的黑色蝴蝶结,与身后柔和的蔚蓝天空形成鲜明对比。她面容清秀,眉目精致,肤色白皙细腻,透着一股甜美的青春气息;神情柔和,略带羞涩,目光静静地凝望着远方的地平线,双手自然交叠于身前,手指清晰可见、五指完整、指节自然、姿势优雅放松,仿佛沉浸在思绪之中。背景是辽阔无垠、波光粼粼的大海,阳光洒在海面上,映出温暖的金色光晕,海浪轻轻拍打沙滩,天空湛蓝云朵稀薄。整体画面高清锐利、细节丰富、色彩鲜艳、焦点清晰、8K分辨率、杰作、最佳质量、无模糊、无噪点、无畸变、自然光照、电影级渲染。"
    prompt = "Photorealistic portrait of a beautiful young East Asian woman with long, vibrant purple hair and a black bow. She is wearing a flowing white summer dress, standing on a sunny beach with a sparkling ocean and clear blue sky in the background. Bright natural sunlight, sharp focus, ultra-detailed."
    negative_prompt = "Low quality, blurry, ugly, deformed fingers, extra fingers, bad hand, bad anatomy, noise, overexposed, underexposed"

    target_height = 1728
    target_width = 992
    num_inference_steps = 20
    guidance_scale = 0  # 2.5
    controlnet_conditioning_scale = 0.7
    controlnet_conditioning_refiner_scale = 0.75
    mask_blur_radius = 8.0
    seed = 42
    shift = 3.0
    generator = torch.Generator("cuda").manual_seed(seed)

    print("Loading Pipeline...")
    scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=shift)

    if use_gguf:
        transformer = ZImageControlTransformer2DModel.from_single_file(
            GGUF_MODEL_FILE,
            torch_dtype=torch.bfloat16,
            config=str(Path(GGUF_MODEL_FILE).parent),
            quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
            add_control_noise_refiner=True,  # <== Set to False if you don't want to use the control noise refiner.
        )
    else:
        transformer = ZImageControlTransformer2DModel.from_pretrained(
            BASE_MODEL_ID,
            subfolder="transformer",
            torch_dtype=torch.bfloat16,
            add_control_noise_refiner=True,  # <== Set to False if you don't want to use the control noise refiner.
        )
    pipe = ZImageControlUnifiedPipeline.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.bfloat16,
        transformer=transformer,  # For the diffusers checkpoint you can omit this unless you want to change add_control_noise_refiner; for GGUF the transformer must be loaded separately.
    )
    pipe.scheduler = scheduler

    # Apply optimization (Optional)
    pipe.enable_group_offload(
        onload_device="cuda", offload_device="cpu", offload_type="block_level", num_blocks_per_group=1, low_cpu_mem_usage=True, use_stream=True
    )
    pipe.vae.use_tiling = True
    # ---

    print("\nRunning Inference...")

    pose_image = load_image("assets/pose.jpg")
    inpaint_image = load_image("assets/inpaint.jpg")
    mask_image = load_image("assets/mask_inpaint.jpg")

    start_inference_time = time.time()

    generated_image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=inpaint_image,
        control_image=pose_image,
        mask_image=mask_image,
        mask_blur_radius=mask_blur_radius,
        height=target_height,
        width=target_width,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        controlnet_refiner_conditioning_scale=controlnet_conditioning_refiner_scale,
        generator=generator,
    ).images[0]

    end_inference_time = time.time()
    print(f"\nGeneration finished in {end_inference_time - start_inference_time:.2f} seconds.")

    # Save Output
    if not os.path.exists("outputs"):
        os.makedirs("outputs")
    output_filename = "outputs/z_image_controlnet_result_inpaint.png"
    generated_image.save(output_filename)
    print(f"Image successfully saved as '{output_filename}'")
    generated_image.show()


if __name__ == "__main__":
    main()
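On mask_blur_radius=8.0: the parameter presumably softens the mask edge so the inpainted region blends into the untouched pixels (an assumption about this pipeline's internals, not verified here). If you prefer to pre-blur the mask yourself with PIL, a minimal sketch comparable to that setting:

from PIL import Image, ImageFilter

# Soften the binary inpainting mask so the repainted region blends smoothly at its border.
mask = Image.open("assets/mask_inpaint.jpg").convert("L")
soft_mask = mask.filter(ImageFilter.GaussianBlur(radius=8))  # roughly comparable to mask_blur_radius=8.0
soft_mask.save("mask_inpaint_blurred.png")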
infer_t2i.py
ADDED
@@ -0,0 +1,89 @@
import os
import time
from pathlib import Path

import torch
from diffusers import FlowMatchEulerDiscreteScheduler, GGUFQuantizationConfig
from diffusers_local import patch  # Apply necessary patches for local diffusers components

# 1. Import all necessary components
from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline
from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel


os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,garbage_collection_threshold:0.7,max_split_size_mb:1024"


def main():
    # 2. Set params
    BASE_MODEL_ID = "."
    GGUF_MODEL_FILE = "./transformer/z_image_turbo_control_unified_v2.1_q4_k_m.gguf"

    use_gguf = True

    # Chinese prompt: a young woman in a flowing white dress with long purple hair and a black bow, standing on a sunny shoreline; see the English variant in infer_controlnet.py.
    prompt = "一位年轻女子站在阳光明媚的海岸线上,白裙在轻拂的海风中微微飘动,裙摆轻盈飞扬。她拥有一头鲜艳的紫色长发,在风中轻盈舞动,发间系着一个精致的黑色蝴蝶结,与身后柔和的蔚蓝天空形成鲜明对比。她面容清秀,眉目精致,肤色白皙细腻,透着一股甜美的青春气息;神情柔和,略带羞涩,目光静静地凝望着远方的地平线,双手自然交叠于身前,手指清晰可见、五指完整、指节自然、姿势优雅放松,仿佛沉浸在思绪之中。背景是辽阔无垠、波光粼粼的大海,阳光洒在海面上,映出温暖的金色光晕,海浪轻轻拍打沙滩,天空湛蓝云朵稀薄。整体画面高清锐利、细节丰富、色彩鲜艳、焦点清晰、8K分辨率、杰作、最佳质量、无模糊、无噪点、无畸变、自然光照、电影级渲染。"
    negative_prompt = "Low quality, blurry, ugly, deformed fingers, extra fingers, bad hand, bad anatomy, noise, overexposed, underexposed"

    target_height = 1728
    target_width = 992
    num_inference_steps = 9
    guidance_scale = 0
    seed = 43
    shift = 3.0
    generator = torch.Generator("cuda").manual_seed(seed)

    print("Loading Pipeline...")
    scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=shift)

    if use_gguf:
        transformer = ZImageControlTransformer2DModel.from_single_file(
            GGUF_MODEL_FILE,
            torch_dtype=torch.bfloat16,
            config=str(Path(GGUF_MODEL_FILE).parent),
            quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
            use_controlnet=False,  # <== Disable the control layers for faster inference.
        )
    else:
        transformer = ZImageControlTransformer2DModel.from_pretrained(
            BASE_MODEL_ID,
            subfolder="transformer",
            torch_dtype=torch.bfloat16,
            use_controlnet=False,  # <== Disable the control layers for faster inference.
        )
    pipe = ZImageControlUnifiedPipeline.from_pretrained(BASE_MODEL_ID, torch_dtype=torch.bfloat16, transformer=transformer)
    pipe.scheduler = scheduler

    # Apply optimization (Optional)
    pipe.enable_group_offload(
        onload_device="cuda", offload_device="cpu", offload_type="block_level", num_blocks_per_group=1, low_cpu_mem_usage=True, use_stream=True
    )
    pipe.vae.use_tiling = True
    # ---

    print("\nRunning Inference...")
    start_inference_time = time.time()

    generated_image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=target_height,
        width=target_width,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        generator=generator,
    ).images[0]

    end_inference_time = time.time()
    print(f"\nGeneration finished in {end_inference_time - start_inference_time:.2f} seconds.")

    # Save Output
    if not os.path.exists("outputs"):
        os.makedirs("outputs")
    output_filename = "outputs/z_image_controlnet_result_t2i.png"
    generated_image.save(output_filename)
    print(f"Image successfully saved as '{output_filename}'")
    generated_image.show()


if __name__ == "__main__":
    main()
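Note that GGUF_MODEL_FILE above now points at ./transformer/, matching the other scripts and the tracked location of the .gguf file; the original path at the repository root would also have resolved the transformer config to the wrong folder. Separately, enable_group_offload with use_stream=True needs a recent diffusers build (requirements.txt installs diffusers from GitHub). If your install lacks it, a simpler fallback that should work on standard diffusers releases (assuming accelerate is installed) is shown below; it replaces the "Apply optimization" block in any of these scripts.

# Fallback sketch: full model CPU offload trades some speed for lower VRAM.
pipe.enable_model_cpu_offload()
# or, even lower VRAM but slower:
# pipe.enable_sequential_cpu_offload()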
model_index.json
ADDED
@@ -0,0 +1,24 @@
{
  "_class_name": "ZImagePipeline",
  "_diffusers_version": "0.36.0.dev0",
  "scheduler": [
    "diffusers",
    "FlowMatchEulerDiscreteScheduler"
  ],
  "text_encoder": [
    "transformers",
    "Qwen3Model"
  ],
  "tokenizer": [
    "transformers",
    "Qwen2Tokenizer"
  ],
  "transformer": [
    "diffusers",
    "ZImageControlTransformer2DModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}
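model_index.json is what from_pretrained reads to map each sub-folder to a (library, class) pair. Since _class_name still says ZImagePipeline, the scripts instantiate the local ZImageControlUnifiedPipeline class directly, which sidesteps class-name resolution while the sub-components are still loaded from this index. A small sketch (assuming the repository root as the model path, as in the scripts):

import torch
from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline

# Sub-components (scheduler, text_encoder, tokenizer, transformer, vae) are resolved from model_index.json.
pipe = ZImageControlUnifiedPipeline.from_pretrained(".", torch_dtype=torch.bfloat16)
print(type(pipe.text_encoder).__name__)  # Qwen3Model
print(type(pipe.vae).__name__)           # AutoencoderKL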
requirements.txt
ADDED
@@ -0,0 +1,22 @@
--extra-index-url https://download.pytorch.org/whl/cu126

torch==2.8.0+cu126
torchvision==0.23.0+cu126
torchaudio==2.8.0+cu126
transformers==4.56.0
bitsandbytes==0.48.1
xformers==0.0.32.post2
git+https://github.com/huggingface/diffusers
hf_xet
gguf
accelerate
protobuf
einops
matplotlib
sacremoses
scikit-image
sentencepiece
scipy
opencv-python
triton-windows<3.5; sys_platform == 'win32'
triton==3.4.0; sys_platform != 'win32'
results/canny.png
ADDED
Git LFS Details

results/depth.png
ADDED
Git LFS Details

results/hed.png
ADDED
Git LFS Details

results/new_tests/controlnet_result_i2i.png
ADDED
Git LFS Details

results/new_tests/result_control_canny.png
ADDED
Git LFS Details

results/new_tests/result_control_depth.png
ADDED
Git LFS Details

results/new_tests/result_control_hed.png
ADDED
Git LFS Details

results/new_tests/result_control_inpaint_original_mask.png
ADDED
Git LFS Details

results/new_tests/result_control_mlsd.png
ADDED
Git LFS Details

results/new_tests/result_control_pose.png
ADDED
Git LFS Details

results/new_tests/result_inpaint.png
ADDED
Git LFS Details

results/new_tests/result_t2i.png
ADDED
Git LFS Details