| | import gradio as gr |
| | import torch |
| | from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline |
| | from PIL import Image |
| | import random |
| |
|
| | |
# Detect GPU availability once at import time; controls device placement
# for the Zephyr text-generation pipeline in load_models().
use_gpu = torch.cuda.is_available()

# Models are lazy-loaded on first use (see load_models) to keep startup fast.
# None means "not loaded yet"; all three are populated together.
processor, model, zephyr_generator = None, None, None
| |
|
| |
|
def load_models():
    """Lazily load the BLIP captioning model and the SARA-Zephyr generator.

    Populates the module-level globals ``processor``, ``model`` and
    ``zephyr_generator``. Safe to call repeatedly: models are only loaded
    when any of the three globals is still None. Propagates whatever the
    underlying libraries raise on download/initialization failure.
    """
    global processor, model, zephyr_generator
    if processor is None or model is None or zephyr_generator is None:
        print("Loading BLIP model...")
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-large",
            # float32 keeps CPU inference working; inputs are built on CPU
            # in analyze_image_with_zephyr, so the model stays on CPU too.
            torch_dtype=torch.float32
        )
        # NOTE(review): original status-emoji literals were mojibake-broken
        # across lines; reconstructed here.
        print("✅ BLIP model loaded successfully!")
        print("Loading SARA-Zephyr fine-tuned model...")
        zephyr_generator = pipeline(
            "text-generation",
            model="Malaji71/SARA-Zephyr",
            torch_dtype=torch.float32,
            device_map="auto" if use_gpu else None  # auto-shard onto GPU(s) when available
        )
        print("✅ SARA-Zephyr fine-tuned model loaded successfully!")
| |
|
| |
|
| | |
# Markdown shown on the "Prompting Guide" tab.
# NOTE(review): the original literal was mojibake-garbled (emoji bytes split
# across lines, making the string invalid); reconstructed with the most
# likely intended characters — confirm emoji choices against the deployed UI.
unified_instructions = """
# 🎬 Universal Video Prompting Guide
*Compatible with Gen-4, Sora, Pika, Luma, Runway and all diffusion-based video models*
## Core Principles (Universal)
✅ **Focus on MOTION, not static description**
✅ **Use positive phrasing exclusively**
✅ **Start simple, iterate progressively**
✅ **Refer to subjects in general terms** ("the subject," "the woman")
✅ **Keep prompts direct and easily understood**
## Two Complementary Approaches
### 📋 **Gen-4 Official Method** (Recommended for beginners)
**Structure**: Simple iterative building
1. Start with essential motion only
2. Add one element at a time: Subject Motion → Camera Motion → Scene Motion → Style Descriptors
3. Use general terms and avoid complex descriptions
**Example**:
- Basic: "The subject walks forward"
- + Camera: "The subject walks forward. Handheld camera follows"
- + Scene: "The subject walks forward. Handheld camera follows. Dust trails behind"
- + Style: "The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."
### 🎯 **SARA Framework** (Advanced precision)
**Structure**: [Subject] + [Action] + [Reference] + [Atmosphere]
- **Subject (S)**: Main element to control
- **Action (A)**: Movement/transformation ([verb] + [adverb])
- **Reference (R)**: Spatial anchors ("while X remains steady")
- **Atmosphere (A)**: Context and style
**Template**: [Subject] [verb] [adverb] while [reference] [atmosphere]
**Example**: "The subject walks smoothly while background remains steady, cinematic atmosphere"
"""
| |
|
| |
|
def analyze_image_with_zephyr(image):
    """Analyze an uploaded image with BLIP captioning plus SARA-Zephyr.

    Args:
        image: PIL image (numpy arrays are converted); None yields a
            friendly prompt to upload first.

    Returns:
        Tuple of (markdown report string, scene_info dict). scene_info is
        reused by the prompt-generation functions; it is empty on error.
    """
    if image is None:
        return "Please upload an image first.", {}
    try:
        # Lazy-load BLIP + Zephyr on first use.
        load_models()

        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Cheap composition heuristic from the aspect ratio alone.
        width, height = image.size
        aspect_ratio = width / height
        if aspect_ratio > 1.5:
            composition = "Wide landscape shot"
        elif aspect_ratio < 0.7:
            composition = "Vertical portrait shot"
        else:
            composition = "Balanced composition"

        # Unconditional BLIP caption (no text prompt supplied).
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs, max_length=50, num_beams=3)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)

        enhanced_analysis = analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition)

        # NOTE(review): emoji/bullets below were mojibake in the original;
        # reconstructed with the most likely intended characters.
        analysis = f"""🔍 **Image Analysis:**
• **Dimensions**: {width} x {height}
• **Composition**: {composition}
• **Aspect Ratio**: {aspect_ratio:.2f}
🎨 **Scene Description**:
"{basic_caption}"
🤖 **AI Enhanced Analysis**:
{enhanced_analysis['scene_interpretation']}
💡 **Motion Insights**:
{chr(10).join(f"• {insight}" for insight in enhanced_analysis['motion_insights'])}
🎯 **Recommended Approach**:
{enhanced_analysis['recommended_approach']}"""

        scene_info = {
            'basic_description': basic_caption,
            'composition': composition,
            'aspect_ratio': aspect_ratio,
            'enhanced_analysis': enhanced_analysis
        }
        return analysis, scene_info
    except Exception as e:
        # Surface the error in the UI instead of crashing the app.
        return f"Error analyzing image: {str(e)}", {}
| |
|
| |
|
def analyze_scene_with_zephyr(basic_caption, aspect_ratio, composition):
    """Run SARA-Zephyr over a BLIP caption and distill motion guidance.

    Returns a dict with 'scene_interpretation' (first reply line),
    'motion_insights' (at most 6 bullet lines) and 'recommended_approach'.
    """
    analysis_prompt = f"""<|system|>
You are a video prompt engineering expert specializing in the SARA framework. Analyze this image description for video creation potential.
<|user|>
Image description: "{basic_caption}"
Image composition: {composition}
Aspect ratio: {aspect_ratio:.2f}
Please provide:
1. Type of motion that would work best
2. Recommended camera movements
3. Emotional tone/style suggestions
4. Best prompting approach (SARA vs Gen-4)
Be concise and practical.
<|assistant|>"""
    generation = zephyr_generator(
        analysis_prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id
    )
    # Keep only the assistant's reply, then mine it line by line.
    ai_analysis = generation[0]['generated_text'].split("<|assistant|>")[-1].strip()

    motion_keywords = ('motion', 'movement', 'camera', 'lighting')
    motion_insights = []
    recommended_approach = "SARA framework recommended for precise control"
    for raw_line in ai_analysis.split('\n'):
        if not raw_line.strip():
            continue
        lowered = raw_line.lower()
        if any(keyword in lowered for keyword in motion_keywords):
            motion_insights.append(raw_line.strip('- ').strip())
        elif 'sara' in lowered or 'gen-4' in lowered:
            recommended_approach = raw_line.strip('- ').strip()

    first_line = ai_analysis.split('\n')[0] if ai_analysis else "Scene analysis completed"
    return {
        'scene_interpretation': first_line,
        'motion_insights': motion_insights[:6],
        'recommended_approach': recommended_approach
    }
| |
|
| |
|
def generate_sample_prompts_with_zephyr(scene_info=None):
    """Return three sample video prompts.

    Uses SARA-Zephyr when image-analysis context is available; otherwise
    (or when the model produces fewer than 3 usable lines) falls back to
    three static SARA-style examples.
    """
    if scene_info and scene_info.get('basic_description'):
        # Bug fix: this handler can run before any image analysis, so the
        # generator may not be loaded yet — ensure models are ready.
        load_models()
        # Bug fix: formatting the 'N/A' fallback with :.2f raised ValueError.
        aspect_ratio = scene_info.get('aspect_ratio')
        aspect_text = f"{aspect_ratio:.2f}" if isinstance(aspect_ratio, (int, float)) else "N/A"
        context_prompt = f"""<|system|>
Generate 3 professional video prompts using the SARA framework based on this image analysis.
<|user|>
Image description: {scene_info['basic_description']}
Composition: {scene_info.get('composition', 'Balanced')}
Aspect Ratio: {aspect_text}
Remember the SARA framework: Subject + Action + Reference + Atmosphere
<|assistant|>"""
        response = zephyr_generator(
            context_prompt,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.8,
            pad_token_id=zephyr_generator.tokenizer.eos_token_id
        )
        prompts_text = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
        # Strip list markers (numbers, dots, dashes, bullets) from each line.
        prompts = [p.strip('123.-• ') for p in prompts_text.split('\n') if p.strip()]
        if len(prompts) >= 3:
            return prompts[:3]

    # Static fallback: no image context, or model output was too short.
    return [
        "The subject walks forward smoothly while the background remains steady, cinematic atmosphere.",
        "A dramatic close-up captures the subject's expression as they speak directly to the camera.",
        "The scene transitions with a handheld camera following the subject through a bustling environment."
    ]
| |
|
| |
|
def optimize_user_prompt_with_zephyr(user_idea, scene_info=None):
    """Rewrite a free-form user idea into a SARA-framework video prompt.

    Args:
        user_idea: raw idea text (any language); blank input returns a hint.
        scene_info: optional dict from analyze_image_with_zephyr; its
            'basic_description' is added as context when present.

    Returns:
        The optimized prompt string produced by SARA-Zephyr.
    """
    if not user_idea.strip():
        return "Please enter your idea first."
    # Bug fix: this tab can be used without ever analyzing an image, in which
    # case zephyr_generator was still None and the call below crashed.
    load_models()

    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"

    optimization_prompt = f"""<|system|>
You are an expert in video prompting, specializing in the SARA framework. Transform user ideas into professional prompts compatible with AI video models like Sora, Gen-4, Pika, Runway, and Luma.
Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
User's idea: "{user_idea}"
{context}
Please create an optimized video prompt using the SARA framework. Respond with just the prompt.
<|assistant|>"""
    response = zephyr_generator(
        optimization_prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id
    )
    # Keep only the assistant's reply segment.
    optimized = response[0]['generated_text'].split("<|assistant|>")[-1].strip()
    return optimized
| |
|
| |
|
def refine_prompt_with_zephyr(current_prompt, feedback, chat_history, scene_info=None):
    """Refine a prompt based on user feedback using SARA-Zephyr.

    Args:
        current_prompt: the prompt being iterated on.
        feedback: user's change request; blank feedback is a no-op.
        chat_history: running history in OpenAI-style messages format.
        scene_info: optional image-analysis context.

    Returns:
        (refined_prompt, new_chat_history)
    """
    if not feedback.strip():
        return current_prompt, chat_history
    # Bug fix: ensure the generator is loaded even if the user never ran
    # image analysis first (zephyr_generator starts as None).
    load_models()

    context = ""
    if scene_info and scene_info.get('basic_description'):
        context = f"Image context: {scene_info['basic_description']}"

    refinement_prompt = f"""<|system|>
You are an expert in refining video prompts using the SARA framework. Based on the user's feedback, improve the current prompt while maintaining its core structure.
Key principles:
- Focus on MOTION, not static description
- Use positive phrasing
- Be specific about camera work
- Include lighting/atmosphere details
- Follow the SARA structure: Subject + Action + Reference + Atmosphere
<|user|>
Current prompt: "{current_prompt}"
Feedback: "{feedback}"
{context}
Please refine the prompt while keeping it under 100 words. Respond with just the refined prompt.
<|assistant|>"""
    response = zephyr_generator(
        refinement_prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        pad_token_id=zephyr_generator.tokenizer.eos_token_id
    )
    refined = response[0]['generated_text'].split("<|assistant|>")[-1].strip()

    # Bug fix: the Chatbot is created with type='messages', which expects
    # role/content dicts — the previous [user, bot] pair format broke display.
    new_chat_history = chat_history + [
        {"role": "user", "content": feedback},
        {"role": "assistant", "content": refined},
    ]
    return refined, new_chat_history
| |
|
| |
|
def generate_gen4_prompts(scene_info, foundation=""):
    """Generate Gen-4 style prompts iteratively (basic → motion → camera → style).

    Args:
        scene_info: dict from image analysis (falsy → static example output).
        foundation: optional user-supplied base sentence. Bug fix: this was
            previously accepted but silently ignored; when non-blank it now
            replaces the auto-generated "<subject> <action>" base.

    Returns:
        A markdown string showing the iterative prompt build-up.
    """
    try:
        if scene_info and scene_info.get('basic_description'):
            description = scene_info['basic_description']
            lowered = description.lower()

            # Bug fix: check 'woman' before 'man' — 'man' is a substring of
            # 'woman', so captions about women previously yielded "The man".
            if 'woman' in lowered:
                subject = "The woman"
            elif 'man' in lowered:
                subject = "The man"
            elif 'person' in lowered:
                subject = "The person"
            else:
                subject = "The subject"

            # Pick a plausible action pool from posture cues in the caption.
            if any(word in lowered for word in ['sitting', 'seated']):
                actions = ['speaks to camera', 'gestures while seated', 'leans forward', 'adjusts posture']
            elif any(word in lowered for word in ['standing', 'portrait']):
                actions = ['speaks directly', 'gestures naturally', 'shifts weight', 'looks around']
            else:
                actions = ['moves forward', 'turns slightly', 'gestures', 'demonstrates']
            action = random.choice(actions)

            # Honor the user's foundation text when provided.
            basic = foundation.strip() if foundation and foundation.strip() else f"{subject} {action}"
            with_motion = f"{basic} smoothly"
            with_camera = f"{with_motion}. Camera captures steadily"

            composition = scene_info.get('composition', '')
            if 'Wide' in composition:
                style_addition = "Wide cinematic framing"
            elif 'Portrait' in composition:
                style_addition = "Intimate portrait lighting"
            else:
                style_addition = "Professional documentary style"
            with_style = f"{with_camera}. {style_addition}."
            return f"""🚀 **Gen-4 Iterative Building:**
**Basic**: {basic}
**+ Motion**: {with_motion}
**+ Camera**: {with_camera}
**+ Style**: {with_style}"""
        else:
            # No image context: show the canonical Gen-4 teaching example.
            return """🚀 **Gen-4 Iterative Building:**
**Basic**: The subject walks forward
**+ Camera**: The subject walks forward. Handheld camera follows
**+ Scene**: The subject walks forward. Handheld camera follows. Dust trails behind
**+ Style**: The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."""
    except Exception as e:
        return f"Error generating Gen-4 prompts: {str(e)}"
| |
|
| |
|
def build_custom_prompt(foundation, subject_motion, scene_motion, camera_motion, style, approach="SARA"):
    """Assemble a prompt from the Custom Builder selections.

    SARA output reads "<foundation> <motions> while <camera | background
    remains steady> <style>"; Gen-4 output is a period-joined segment list
    with a generic fallback when nothing was selected.
    """
    if approach == "SARA":
        segments = [foundation] if foundation else []

        # Merge subject and scene motions into one comma-separated clause.
        movements = list(subject_motion or []) + list(scene_motion or [])
        if movements:
            segments.append(", ".join(movements))

        # SARA always carries a spatial reference ("while ...") clause.
        anchor = camera_motion if camera_motion else "background remains steady"
        segments.append(f"while {anchor}")

        if style:
            segments.append(style)
        return " ".join(segments)

    # Gen-4: flat list of segments joined by periods.
    segments = [foundation] if foundation else []
    segments.extend(subject_motion or [])
    if camera_motion:
        segments.append(camera_motion)
    segments.extend(scene_motion or [])
    if style:
        segments.append(style)
    return ". ".join(segments) if segments else "The subject moves naturally"
| |
|
| |
|
| | |
def create_interface():
    """Create the Gradio Blocks interface.

    Layout: five tabs (guide, image analysis, AI prompt generator, Gen-4
    official, custom builder) plus two pieces of cross-tab state; event
    handlers are wired at the end. Returns the Blocks app (not launched).
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="AI Video Prompt Generator") as demo:
        # NOTE(review): emoji in the UI strings below appear mojibake-garbled
        # in this file; kept byte-identical here to avoid behavior changes.
        gr.Markdown("# π¬ AI Video Prompt Generator - π€ SARA-Zephyr AI Powered")
        gr.Markdown("*Professional prompts for Sora, Gen-4, Pika, Luma, Runway and more*")

        # Cross-tab state: last image-analysis result and refinement history.
        scene_state = gr.State({})
        chat_history_state = gr.State([])
        with gr.Tabs():

            # Tab 1: static prompting documentation.
            with gr.Tab("π Prompting Guide"):
                gr.Markdown(unified_instructions)
                with gr.Accordion("π― Advanced Tips", open=False):
                    gr.Markdown("""
## Advanced Prompting Strategies
### π¨ Style Integration
- **Cinematography**: "Dutch angle," "Extreme close-up," "Bird's eye view"
- **Lighting**: "Golden hour," "Neon glow," "Harsh shadows," "Soft diffused light"
- **Movement Quality**: "Fluid motion," "Mechanical precision," "Organic flow"
### β‘ Motion Types
- **Subject Motion**: Walking, running, dancing, gesturing
- **Camera Motion**: Pan, tilt, dolly, zoom, orbit, tracking
- **Environmental**: Wind, water flow, particle effects, lighting changes
""")

            # Tab 2: upload an image, run BLIP+Zephyr analysis, sample prompts.
            with gr.Tab("π· Image Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        image_input = gr.Image(
                            label="Upload Image for Analysis",
                            type="pil"
                        )
                        analyze_btn = gr.Button("π Analyze Image", variant="primary")
                    with gr.Column(scale=2):
                        analysis_output = gr.Markdown(label="AI Analysis Results")

                with gr.Group():
                    gr.Markdown("### π‘ Sample Prompts")
                    sample_btn = gr.Button("π² Generate Sample Prompts")
                    # Three copyable boxes; generate_sample_prompts_with_zephyr
                    # returns exactly three strings, one per box.
                    sample_prompts = [
                        gr.Textbox(
                            label=f"Sample {i+1}",
                            lines=2,
                            interactive=False,
                            show_copy_button=True
                        )
                        for i in range(3)
                    ]

            # Tab 3: free-form idea -> optimized prompt, with refinement loop.
            with gr.Tab("π€ AI Prompt Generator"):
                with gr.Row():
                    with gr.Column():
                        user_idea = gr.Textbox(
                            label="Your Video Idea (any language)",
                            placeholder="e.g., 'el personaje se quita la nariz' or 'character walks slowly'",
                            lines=3
                        )
                        optimize_btn = gr.Button("π Generate Optimized Prompt", variant="primary")
                        optimized_prompt = gr.Textbox(
                            label="AI-Optimized Video Prompt",
                            lines=4,
                            interactive=True,
                            show_copy_button=True
                        )
                    with gr.Column():
                        gr.Markdown("### π Refine Your Prompt")
                        feedback_input = gr.Textbox(
                            label="Feedback/Changes",
                            placeholder="e.g., 'make it more dramatic' or 'add camera movement'",
                            lines=2
                        )
                        refine_btn = gr.Button("π Refine Prompt")

                        with gr.Accordion("π¬ Refinement History", open=False):
                            # type='messages' expects role/content dicts from
                            # the refinement handler.
                            chat_display = gr.Chatbot(height=300, type='messages')

            # Tab 4: Gen-4 official iterative-building method.
            with gr.Tab("π Gen-4 Official"):
                gr.Markdown("*Official Gen-4 method: Simple β Complex building*")
                with gr.Row():
                    foundation_gen4 = gr.Textbox(
                        label="Foundation (Optional)",
                        placeholder="e.g., 'The subject walks forward'",
                        lines=1
                    )
                    generate_gen4_btn = gr.Button("Generate Gen-4 Prompts", variant="primary")
                gen4_output = gr.Textbox(
                    label="Gen-4 Style Prompts",
                    lines=8,
                    interactive=False,
                    show_copy_button=True
                )

            # Tab 5: checkbox/dropdown-driven prompt builder (SARA or Gen-4).
            with gr.Tab("π οΈ Custom Builder"):
                gr.Markdown("## Build Your Custom Prompt")
                with gr.Row():
                    approach_selector = gr.Radio(
                        choices=["SARA", "Gen-4"],
                        value="SARA",
                        label="Approach",
                        interactive=True
                    )
                    custom_foundation = gr.Textbox(
                        label="Foundation",
                        placeholder="The subject...",
                        lines=1
                    )
                with gr.Row():
                    subject_motion = gr.CheckboxGroup(
                        choices=["walks smoothly", "speaks clearly", "gestures naturally", "moves gracefully", "turns slowly"],
                        label="Subject Motion"
                    )
                    scene_motion = gr.CheckboxGroup(
                        choices=["dust swirls", "lighting changes", "wind effects", "water movement", "atmosphere shifts"],
                        label="Scene Motion"
                    )
                with gr.Row():
                    camera_motion = gr.Dropdown(
                        choices=["camera remains steady", "handheld camera", "camera pans left", "camera pans right", "camera tracks forward", "camera zooms in"],
                        label="Camera Motion",
                        value="camera remains steady"
                    )
                    style_motion = gr.Dropdown(
                        choices=["cinematic", "documentary style", "live-action", "dramatic", "peaceful", "energetic", "professional"],
                        label="Style/Atmosphere",
                        value="cinematic"
                    )
                build_custom_btn = gr.Button("π¨ Build Custom Prompt", variant="secondary")
                custom_output = gr.Textbox(
                    label="Your Custom Prompt",
                    lines=3,
                    interactive=True,
                    show_copy_button=True
                )

        # --- Event wiring -------------------------------------------------
        # Image analysis fills both the report panel and the shared state.
        analyze_btn.click(
            fn=analyze_image_with_zephyr,
            inputs=[image_input],
            outputs=[analysis_output, scene_state]
        )
        sample_btn.click(
            fn=generate_sample_prompts_with_zephyr,
            inputs=[scene_state],
            outputs=sample_prompts
        )
        optimize_btn.click(
            fn=optimize_user_prompt_with_zephyr,
            inputs=[user_idea, scene_state],
            outputs=[optimized_prompt]
        )
        refine_btn.click(
            fn=refine_prompt_with_zephyr,
            inputs=[optimized_prompt, feedback_input, chat_history_state, scene_state],
            outputs=[optimized_prompt, chat_history_state]
        )

        # Mirror the history state into the Chatbot widget whenever it changes.
        chat_history_state.change(
            fn=lambda history: history,
            inputs=[chat_history_state],
            outputs=[chat_display]
        )
        generate_gen4_btn.click(
            fn=generate_gen4_prompts,
            inputs=[scene_state, foundation_gen4],
            outputs=[gen4_output]
        )
        build_custom_btn.click(
            fn=build_custom_prompt,
            inputs=[custom_foundation, subject_motion, scene_motion, camera_motion, style_motion, approach_selector],
            outputs=[custom_output]
        )
    return demo
| |
|
| |
|
| | |
if __name__ == "__main__":
    # NOTE(review): status-emoji literals in the original were mojibake-broken
    # across lines; reconstructed with their most likely intended characters.
    print("🎬 Starting AI Video Prompt Generator with SARA-Zephyr...")
    print(f"📊 Status: {'GPU' if use_gpu else 'CPU'} Mode Enabled")
    print("🔧 Loading models (this may take a few minutes)...")
    try:
        demo = create_interface()
        print("✅ Interface created successfully!")
        print("🚀 Launching application...")
        demo.launch(
            share=True,
            server_name="0.0.0.0",  # listen on all interfaces (container/Spaces friendly)
            server_port=7860,
            debug=True,
            show_error=True
        )
    except Exception as e:
        print(f"❌ Error launching app: {e}")
        print("🔧 Make sure you have sufficient CPU resources and all dependencies installed.")
        print("📦 Required packages:")
        print("   pip install torch transformers gradio pillow accelerate bitsandbytes")

        # Fallback: retry a local-only launch with sharing and debug disabled.
        print("\n🔄 Attempting alternative launch...")
        try:
            demo = create_interface()
            demo.launch(
                share=False,
                server_name="127.0.0.1",
                server_port=7860,
                debug=False
            )
        except Exception as e2:
            print(f"❌ Alternative launch failed: {e2}")
            print("\n💡 Troubleshooting tips:")
            print("1. Ensure CPU resources are sufficient.")
            print("2. Check CPU usage: top or htop")
            print("3. Try reducing model precision: set torch_dtype=torch.float32")
            print("4. Monitor memory usage: free -h")