File size: 5,850 Bytes
55fe803
62594be
55fe803
 
9d9e3d4
 
bd4c365
9d9e3d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55fe803
 
9d9e3d4
55fe803
78bec1d
e24aa73
55fe803
6c5fa74
3e7b179
 
9d9e3d4
 
 
 
 
 
fa46cad
9d9e3d4
8927bdc
 
9d9e3d4
06484eb
 
fa757c4
 
 
 
 
06484eb
fa757c4
06484eb
 
 
8927bdc
06484eb
8927bdc
06484eb
 
 
 
 
9d9e3d4
 
 
fa757c4
06484eb
 
 
 
 
 
 
 
 
 
fa46cad
9d9e3d4
 
 
62594be
58264c9
9d9e3d4
 
 
 
06484eb
9d9e3d4
 
 
 
55fe803
 
9d9e3d4
 
 
58264c9
9d9e3d4
58264c9
9d9e3d4
 
8927bdc
9d9e3d4
55fe803
9d9e3d4
 
3ac4904
55fe803
 
 
 
3ac4904
55fe803
5812881
9d9e3d4
 
3ac4904
55fe803
8927bdc
9d9e3d4
8927bdc
 
 
06484eb
 
8927bdc
 
9d9e3d4
 
 
 
 
 
8927bdc
9d9e3d4
 
 
55fe803
9d9e3d4
 
 
 
 
55fe803
 
fa757c4
 
9d9e3d4
55fe803
 
9d9e3d4
58264c9
55fe803
9d9e3d4
 
 
 
 
 
 
 
 
 
55fe803
9d9e3d4
 
 
 
 
55fe803
9d9e3d4
 
 
 
 
 
55fe803
c96ee5c
9d9e3d4
 
556b962
 
58264c9
9d9e3d4
 
58264c9
55fe803
9d9e3d4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import torch
import spaces
import gradio as gr
from diffusers import DiffusionPipeline
import diffusers
import io
import sys
import logging

# ------------------------
# GLOBAL LOG BUFFER
# ------------------------
# In-memory buffer that accumulates log lines so they can be returned to the UI.
log_buffer = io.StringIO()


def log(msg):
    """Echo *msg* to stdout and append it, newline-terminated, to ``log_buffer``.

    Args:
        msg: The message to record. Non-string values (for example an
            exception instance) are converted with ``str()`` so that logging
            them cannot raise ``TypeError`` on string concatenation.
    """
    text = msg if isinstance(msg, str) else str(msg)
    print(text)
    log_buffer.write(text + "\n")

# Raise diffusers' library logger to INFO verbosity (informational messages;
# this is the "info" level, not the more verbose "debug" level).
diffusers.utils.logging.set_verbosity_info()

log("Loading Z-Image-Turbo pipeline...")

# Load the pipeline once at import time; generate_image() below uses this
# module-level `pipe` object for every request.
pipe = DiffusionPipeline.from_pretrained(
    "Tongyi-MAI/Z-Image-Turbo",
    # Weights are loaded in bfloat16.
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=False,
    # NOTE(review): presumably selects a FlashAttention-3 kernel from the Hub;
    # confirm that DiffusionPipeline.from_pretrained actually honours this
    # keyword rather than ignoring it.
    attn_implementation="kernels-community/vllm-flash-attn3",
)

# Move all pipeline components to the CUDA device.
pipe.to("cuda")


#pipe.transformer.layers._repeated_blocks = ["ZImageTransformerBlock"] #spaces.aoti_blocks_load(pipe.transformer.layers, "zerogpu-aoti/Z-Image", variant="fa3")

# ------------------------
# ATTENTION + PIPE INFO
# ------------------------
def pipeline_debug_info(pipe):
    """Summarise the pipeline's transformer and VAE configuration as text.

    The transformer and VAE sections are collected independently and on a
    best-effort basis: if reading one section raises, the lines gathered so
    far are kept and a single "... diagnostics failed" line is added instead
    of propagating the error.

    Args:
        pipe: Object exposing ``transformer.config`` (queried with ``.get``)
            and ``vae.config`` (queried by attribute).

    Returns:
        The collected lines joined with newlines.
    """
    lines = ["=== PIPELINE DEBUG INFO ==="]

    # (label, config key) pairs reported for the transformer, in output order.
    transformer_fields = (
        ("Hidden dim", "hidden_dim"),
        ("Attention heads", "num_heads"),
        ("Depth (layers)", "depth"),
        ("Patch size", "patch_size"),
        ("MLP ratio", "mlp_ratio"),
        ("Attention backend", "attn_implementation"),
    )
    try:
        transformer_cfg = pipe.transformer.config
        lines.append(f"Transformer Class: {pipe.transformer.__class__.__name__}")
        for label, key in transformer_fields:
            # .get() is used so an absent key does not abort the section
            # (assuming a dict-like config, it shows up as None).
            lines.append(f"{label}: {transformer_cfg.get(key)}")
    except Exception as err:
        lines.append(f"Transformer diagnostics failed: {err}")

    # (label, attribute name) pairs reported for the VAE, in output order.
    vae_fields = (
        ("VAE latent channels", "latent_channels"),
        ("VAE scaling factor", "scaling_factor"),
    )
    try:
        vae_cfg = pipe.vae.config
        for label, attr in vae_fields:
            lines.append(f"{label}: {getattr(vae_cfg, attr)}")
    except Exception as err:
        lines.append(f"VAE diagnostics failed: {err}")

    return "\n".join(lines)


def latent_shape_info(h, w, pipe):
    """Describe the latent shape corresponding to an ``h`` x ``w`` image.

    The latent height/width are the pixel dimensions divided by the VAE's
    *spatial downsampling factor*. ``vae.config.scaling_factor`` is
    deliberately not used here: it is the scale applied to latent values, not
    a spatial ratio, so multiplying pixel sizes by it gives meaningless
    numbers.

    The downsampling factor is taken from ``pipe.vae_scale_factor`` when the
    pipeline exposes it, otherwise derived as
    ``2 ** (len(vae.config.block_out_channels) - 1)``.
    NOTE(review): individual pipelines may apply extra rounding/packing when
    they allocate latents - confirm against the pipeline if exact values
    matter.

    Args:
        h: Requested image height in pixels (int or float).
        w: Requested image width in pixels (int or float).
        pipe: Pipeline object exposing ``vae.config``.

    Returns:
        ``"Latent shape → (C, H, W)"`` on success, or a
        ``"Latent shape calc failed: ..."`` message if anything needed for the
        calculation is missing (this is a best-effort diagnostic and never
        raises).
    """
    try:
        vae_config = pipe.vae.config
        channels = vae_config.latent_channels

        factor = getattr(pipe, "vae_scale_factor", None)
        if not factor:
            # One 2x downsample per VAE block after the first.
            factor = 2 ** (len(vae_config.block_out_channels) - 1)
        factor = int(factor)

        h_lat = int(h) // factor
        w_lat = int(w) // factor
        return f"Latent shape → ({channels}, {h_lat}, {w_lat})"
    except Exception as e:
        return f"Latent shape calc failed: {e}"


# ------------------------
# IMAGE GENERATOR
# ------------------------
@spaces.GPU
def generate_image(prompt, height, width, num_inference_steps, seed, randomize_seed, num_images):
    """Run the module-level pipeline for one request and return images, seed and log.

    Args:
        prompt: Text prompt passed to the pipeline.
        height: Output height in pixels; converted with ``int()``.
        width: Output width in pixels; converted with ``int()``.
        num_inference_steps: Number of inference steps; converted with ``int()``.
        seed: Seed for the CUDA generator. ``None`` (what an emptied numeric
            field delivers) is treated like ``randomize_seed=True`` instead of
            crashing on ``int(None)``.
        randomize_seed: When truthy, ignore ``seed`` and draw a random one.
        num_images: Requested image count; clamped to the range 1..3.

    Returns:
        A tuple ``(images, seed, log_text)``: the pipeline's ``images``
        output, the integer seed actually used, and the text accumulated in
        ``log_buffer`` during this call.
    """
    # Reset the shared buffer so the returned log covers only this request.
    log_buffer.truncate(0)
    log_buffer.seek(0)

    # Clamp first so the log reports the image count that is actually used.
    num_images = min(max(1, int(num_images)), 3)

    log("=== NEW GENERATION REQUEST ===")
    print(prompt)
    log(f"Height: {height}, Width: {width}")
    log(f"Inference Steps: {num_inference_steps}")
    log(f"Num Images: {num_images}")

    if randomize_seed or seed is None:
        seed = torch.randint(0, 2**32 - 1, (1,)).item()
        log(f"Randomized Seed → {seed}")
    else:
        # Normalise once so the logged, used and returned seed all agree.
        seed = int(seed)
        log(f"Seed: {seed}")

    # Debug pipe info
    log(pipeline_debug_info(pipe))

    generator = torch.Generator("cuda").manual_seed(seed)

    log("Running pipeline forward()...")
    result = pipe(
        prompt=prompt,
        height=int(height),
        width=int(width),
        num_inference_steps=int(num_inference_steps),
        guidance_scale=0.0,
        generator=generator,
        max_sequence_length=1024,
        num_images_per_prompt=num_images,
        output_type="pil",
    )

    # Post-run diagnostics are best-effort: a failure here is logged and must
    # not discard the images that were already generated.
    try:
        log(f"VAE latent channels: {pipe.vae.config.latent_channels}")
        log(f"VAE scaling factor: {pipe.vae.config.scaling_factor}")
        log(f"Transformer latent size: {pipe.transformer.config.sample_size}")
        log(latent_shape_info(height, width, pipe))

    except Exception as e:
        log(f"Latent diagnostics error: {e}")

    log("Pipeline finished.")
    log("Returning images...")

    return result.images, seed, log_buffer.getvalue()


# ------------------------
# GRADIO UI
# ------------------------
# Example prompts offered below the controls; each inner list holds the value
# for the single `prompt` input.
examples = [
    ["Young Chinese woman in red Hanfu, intricate embroidery..."],
    ["A majestic dragon soaring through clouds at sunset..."],
    ["Cozy coffee shop interior, warm lighting, rain on windows..."],
    ["Astronaut riding a horse on Mars, cinematic lighting..."],
    ["Portrait of a wise old wizard..."],
]

# Build the UI. Components are attached to whichever Blocks/Row/Column context
# is active when they are created, so statement order here defines the layout.
with gr.Blocks(title="Z-Image-Turbo Multi Image Demo") as demo:
    gr.Markdown("# 🎨 Z-Image-Turbo — Multi Image ")

    with gr.Row():
        # First column: generation controls.
        with gr.Column(scale=1):
            prompt = gr.Textbox(label="Prompt", lines=4)

            with gr.Row():
                # Positional slider args are (minimum, maximum, initial value).
                height = gr.Slider(512, 2048, 1024, step=64, label="Height")
                width = gr.Slider(512, 2048, 1024, step=64, label="Width")

            # Range matches the 1..3 clamp applied inside generate_image().
            num_images = gr.Slider(1, 3, 2, step=1, label="Number of Images")

            num_inference_steps = gr.Slider(
                1, 20, 9, step=1, label="Inference Steps",
                info="9 steps = 8 DiT forward passes",
            )

            with gr.Row():
                seed = gr.Number(label="Seed", value=42, precision=0)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=False)

            generate_btn = gr.Button("🚀 Generate", variant="primary")

        # Second column: outputs of generate_image().
        with gr.Column(scale=1):
            output_images = gr.Gallery(label="Generated Images")
            used_seed = gr.Number(label="Seed Used", interactive=False)
            debug_log = gr.Textbox(
                label="Debug Log Output",
                lines=25,
                interactive=False
            )

    # Examples only fill the prompt box; caching is disabled, so nothing is
    # generated ahead of time.
    gr.Examples(examples=examples, inputs=[prompt], cache_examples=False)

    # Input order must match generate_image()'s parameter order; output order
    # must match its returned (images, seed, log text) tuple.
    generate_btn.click(
        fn=generate_image,
        inputs=[prompt, height, width, num_inference_steps, seed, randomize_seed, num_images],
        outputs=[output_images, used_seed, debug_log],
    )

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()