Spaces:
Sleeping
Sleeping
Commit ·
c6f85d1
1
Parent(s): a20609a
MVP application
Browse files- .gitignore +15 -0
- README.md +64 -14
- app.py +69 -150
- requirements.txt +6 -4
- src/__init__.py +0 -0
- src/text_synthesizer.py +121 -0
- src/visual_synthesizer.py +95 -0
- utils/__init__.py +0 -0
- utils/config.py +24 -0
- utils/helpers.py +86 -0
- utils/logger.py +19 -0
.gitignore
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python virtual environment
|
| 2 |
+
venv/
|
| 3 |
+
.venv/
|
| 4 |
+
env/
|
| 5 |
+
.env/
|
| 6 |
+
|
| 7 |
+
# Python cache
|
| 8 |
+
__pycache__/
|
| 9 |
+
*.pyc
|
| 10 |
+
|
| 11 |
+
# Output files
|
| 12 |
+
outputs/
|
| 13 |
+
|
| 14 |
+
# Incomplete files
|
| 15 |
+
src/audio_synthesizer.py
|
README.md
CHANGED
|
@@ -1,14 +1,64 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# From Words to Reels
|
| 2 |
+
|
| 3 |
+
This project generates social media posts, including an image and a caption, from a user-provided text prompt. It leverages deep learning models for both text-to-image synthesis and text generation to create engaging content.
|
| 4 |
+
|
| 5 |
+
## How it Works
|
| 6 |
+
|
| 7 |
+
The process is orchestrated by the `app.py` script and follows these steps:
|
| 8 |
+
|
| 9 |
+
1. **User Input**: The script prompts the user to enter a text prompt.
|
| 10 |
+
2. **Image Generation**: The `VisualSynthesizer` takes the prompt, enhances it, and uses a text-to-image diffusion model (e.g., Stable Diffusion) to generate a corresponding image.
|
| 11 |
+
3. **Caption Generation**: The `TextSynthesizer` uses the original prompt to generate a suitable caption for the post using a causal language model.
|
| 12 |
+
4. **Output**: Both the generated image (`.png`) and the caption (`.txt`) are saved to the `outputs/` directory, prefixed with a timestamp.
|
| 13 |
+
|
| 14 |
+
## Project Structure
|
| 15 |
+
|
| 16 |
+
```
|
| 17 |
+
.
|
| 18 |
+
├── app.py                  # Main script to run the application
|
| 19 |
+
├── README.md # This file
|
| 20 |
+
├── outputs/ # Directory for generated images and captions
|
| 21 |
+
├── src/
|
| 22 |
+
│ ├── visual_synthesizer.py # Handles image generation
|
| 23 |
+
│ ├── text_synthesizer.py # Handles text/caption generation
|
| 24 |
+
└── utils/
|
| 25 |
+
├── config.py # Configuration for models and paths
|
| 26 |
+
└── helpers.py # Helper functions for saving files etc.
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
## Setup and Installation
|
| 30 |
+
|
| 31 |
+
1. **Create a virtual environment:**
|
| 32 |
+
```bash
|
| 33 |
+
python -m venv venv
|
| 34 |
+
venv\Scripts\activate   # Windows — on macOS/Linux use: source venv/bin/activate
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
2. **Install dependencies:**
|
| 38 |
+
Make sure the `requirements.txt` file contains the following:
|
| 39 |
+
```
|
| 40 |
+
torch
|
| 41 |
+
diffusers
|
| 42 |
+
transformers
|
| 43 |
+
sentence-transformers
|
| 44 |
+
Pillow
|
| 45 |
+
accelerate
|
| 46 |
+
```
|
| 47 |
+
Then run:
|
| 48 |
+
```bash
|
| 49 |
+
pip install -r requirements.txt
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## Usage
|
| 53 |
+
|
| 54 |
+
To generate a post, run the `app.py` script:
|
| 55 |
+
|
| 56 |
+
```bash
|
| 57 |
+
python app.py
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
You will be prompted to enter your text. After processing, the generated image and caption will be saved in the `outputs` directory.
|
| 61 |
+
|
| 62 |
+
## Configuration
|
| 63 |
+
|
| 64 |
+
You can customize the models and other parameters by editing the `utils/config.py` file. This allows you to easily swap out different text-to-image or language models.
|
app.py
CHANGED
|
@@ -1,154 +1,73 @@
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import numpy as np
|
| 3 |
-
import random
|
| 4 |
|
| 5 |
-
#
|
| 6 |
-
from
|
| 7 |
-
import
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
)
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
).images[0]
|
| 50 |
-
|
| 51 |
-
return image, seed
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
examples = [
|
| 55 |
-
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
|
| 56 |
-
"An astronaut riding a green horse",
|
| 57 |
-
"A delicious ceviche cheesecake slice",
|
| 58 |
-
]
|
| 59 |
-
|
| 60 |
-
css = """
|
| 61 |
-
#col-container {
|
| 62 |
-
margin: 0 auto;
|
| 63 |
-
max-width: 640px;
|
| 64 |
-
}
|
| 65 |
-
"""
|
| 66 |
-
|
| 67 |
-
with gr.Blocks(css=css) as demo:
|
| 68 |
-
with gr.Column(elem_id="col-container"):
|
| 69 |
-
gr.Markdown(" # Text-to-Image Gradio Template")
|
| 70 |
-
|
| 71 |
-
with gr.Row():
|
| 72 |
-
prompt = gr.Text(
|
| 73 |
-
label="Prompt",
|
| 74 |
-
show_label=False,
|
| 75 |
-
max_lines=1,
|
| 76 |
-
placeholder="Enter your prompt",
|
| 77 |
-
container=False,
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
run_button = gr.Button("Run", scale=0, variant="primary")
|
| 81 |
-
|
| 82 |
-
result = gr.Image(label="Result", show_label=False)
|
| 83 |
-
|
| 84 |
-
with gr.Accordion("Advanced Settings", open=False):
|
| 85 |
-
negative_prompt = gr.Text(
|
| 86 |
-
label="Negative prompt",
|
| 87 |
-
max_lines=1,
|
| 88 |
-
placeholder="Enter a negative prompt",
|
| 89 |
-
visible=False,
|
| 90 |
-
)
|
| 91 |
-
|
| 92 |
-
seed = gr.Slider(
|
| 93 |
-
label="Seed",
|
| 94 |
-
minimum=0,
|
| 95 |
-
maximum=MAX_SEED,
|
| 96 |
-
step=1,
|
| 97 |
-
value=0,
|
| 98 |
-
)
|
| 99 |
-
|
| 100 |
-
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
|
| 101 |
-
|
| 102 |
-
with gr.Row():
|
| 103 |
-
width = gr.Slider(
|
| 104 |
-
label="Width",
|
| 105 |
-
minimum=256,
|
| 106 |
-
maximum=MAX_IMAGE_SIZE,
|
| 107 |
-
step=32,
|
| 108 |
-
value=1024, # Replace with defaults that work for your model
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
-
height = gr.Slider(
|
| 112 |
-
label="Height",
|
| 113 |
-
minimum=256,
|
| 114 |
-
maximum=MAX_IMAGE_SIZE,
|
| 115 |
-
step=32,
|
| 116 |
-
value=1024, # Replace with defaults that work for your model
|
| 117 |
-
)
|
| 118 |
-
|
| 119 |
-
with gr.Row():
|
| 120 |
-
guidance_scale = gr.Slider(
|
| 121 |
-
label="Guidance scale",
|
| 122 |
-
minimum=0.0,
|
| 123 |
-
maximum=10.0,
|
| 124 |
-
step=0.1,
|
| 125 |
-
value=0.0, # Replace with defaults that work for your model
|
| 126 |
-
)
|
| 127 |
-
|
| 128 |
-
num_inference_steps = gr.Slider(
|
| 129 |
-
label="Number of inference steps",
|
| 130 |
-
minimum=1,
|
| 131 |
-
maximum=50,
|
| 132 |
-
step=1,
|
| 133 |
-
value=2, # Replace with defaults that work for your model
|
| 134 |
-
)
|
| 135 |
-
|
| 136 |
-
gr.Examples(examples=examples, inputs=[prompt])
|
| 137 |
-
gr.on(
|
| 138 |
-
triggers=[run_button.click, prompt.submit],
|
| 139 |
-
fn=infer,
|
| 140 |
-
inputs=[
|
| 141 |
-
prompt,
|
| 142 |
-
negative_prompt,
|
| 143 |
-
seed,
|
| 144 |
-
randomize_seed,
|
| 145 |
-
width,
|
| 146 |
-
height,
|
| 147 |
-
guidance_scale,
|
| 148 |
-
num_inference_steps,
|
| 149 |
],
|
| 150 |
-
|
|
|
|
|
|
|
| 151 |
)
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# External library imports
|
| 2 |
+
from datetime import datetime
|
| 3 |
import gradio as gr
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
# Internal imports
|
| 6 |
+
from src.visual_synthesizer import VisualSynthesizer
|
| 7 |
+
from src.text_synthesizer import TextSynthesizer
|
| 8 |
+
# from src.audio_synthesizer import AudioSynthesizer
|
| 9 |
+
from utils.config import *
|
| 10 |
+
# from utils.logger import setup_logger
|
| 11 |
+
from utils.helpers import richify_prompt, save_caption, save_image
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def compose(prompt: str, filename: str = "generated_post"):
    """
    Compose a social media post (image + caption) from a text prompt.

    Args:
        prompt (str): The text prompt used to generate the image and caption.
        filename (str): Base name for the output files; a timestamp prefix is
            added so repeated runs never overwrite each other.

    Returns:
        tuple: (path to the saved image, the generated caption text).
    """
    # Timestamp prefix keeps output filenames unique and records creation time.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    # BUG FIX: the original discarded the `filename` argument and substituted a
    # literal placeholder; prefix the caller-supplied base name instead.
    filename = f"{timestamp}_{filename}"

    # Generate the image from an enriched (quality/style-decorated) prompt.
    image_gen = VisualSynthesizer()
    image = image_gen.generate_image(prompt=richify_prompt(prompt))
    image_path = save_image(image, filename=filename)
    print(f"Image saved at: {image_path}")

    # Generate the caption from the original, unmodified prompt.
    text_gen = TextSynthesizer()
    caption = text_gen.generate_caption(prompt=prompt)
    caption_path = save_caption(caption, filename=filename)
    print(f"Caption saved at: {caption_path}")
    return image_path, caption
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
if __name__ == '__main__':
    # Wire the compose() pipeline into a minimal Gradio UI:
    # one prompt box in, the generated image and caption out.
    iface = gr.Interface(
        fn=compose,
        title="From Words to Reels",
        description="Enter a prompt to generate an image and a corresponding social media caption.",
        inputs=gr.Textbox(lines=5, label="Prompt", placeholder="Enter your prompt here..."),
        outputs=[
            gr.Image(type="filepath", label="Generated Image"),
            gr.Textbox(label="Generated Caption"),
        ],
        allow_flagging="never",
    )

    # Start the web app (blocks until the server is stopped).
    iface.launch()

    # Legacy CLI flow, kept for reference:
    # print(f"From words to reels, creates instagramable posts for your prompts")
    # # setup_logger()
    # input_prompt = input("Enter your prompt: ")
    # if not input_prompt:
    #     print("No prompt provided. Using default prompt.")
    #     input_prompt = (
    #         "Cosmos and the Universe, a vast expanse of stars and galaxies, "
    #         "a reminder of our place in the universe. The beauty of the cosmos is "
    #         "a source of inspiration and wonder, a reminder that we are part of something much larger than ourselves."
    #         "The universe is a canvas, painted with the colors of creation, a masterpiece that continues to unfold before our eyes."
    #     )
    # input_prompt = "Indeed, with hardship comes ease. - Quran 94:6"
    #
    # # Compose a post given the prompt
    # compose(prompt=input_prompt)
    # print(f"Composition successful. Check the output directory for the generated image and caption.")
|
requirements.txt
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
diffusers
|
| 3 |
-
invisible_watermark
|
| 4 |
torch
|
| 5 |
transformers
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
|
|
|
|
|
|
| 2 |
torch
|
| 3 |
transformers
|
| 4 |
+
diffusers
|
| 5 |
+
sentence-transformers
|
| 6 |
+
pillow
|
| 7 |
+
moviepy
|
| 8 |
+
# asyncio is part of the Python standard library; the PyPI "asyncio" package is
# an obsolete shim that can break on modern Python — do not install it.
# asyncio
|
src/__init__.py
ADDED
|
File without changes
|
src/text_synthesizer.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
| 4 |
+
|
| 5 |
+
# Importing the necessary configuration for model names
|
| 6 |
+
from utils.config import TEXT_MODEL_NAME, EMBEDDING_MODEL_NAME
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class TextSynthesizer:
    """Generates captions and text embeddings for social-media posts."""

    def __init__(self, embed_model=EMBEDDING_MODEL_NAME, text_model=TEXT_MODEL_NAME):
        """
        Initializes the TextSynthesizer with embedding and text-generation models.

        Args:
            embed_model (str): Name of the sentence-transformers embedding model.
            text_model (str): Hugging Face model ID of the causal language model
                used for caption generation. Alternatives worth trying:
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "HuggingFaceH4/zephyr-1.1B-alpha".
        """
        self.model = SentenceTransformer(embed_model)
        self.tokenizer = AutoTokenizer.from_pretrained(text_model)
        self.text_model = AutoModelForCausalLM.from_pretrained(
            text_model,
            device_map="auto",
            torch_dtype="auto"
        )

        # BUG FIX: the original passed the model *name* string here, which made
        # the pipeline download/load the model a second time. Reuse the model
        # instance loaded above instead.
        self.text_generator = pipeline(
            'text-generation',
            model=self.text_model,
            tokenizer=self.tokenizer
        )

    def get_embedding(self, text: str):
        """
        Generates an embedding for the input text.

        Args:
            text (str): The input text (e.g., a quote, poem, or verse).

        Returns:
            numpy.ndarray: A vector embedding of the text.
        """
        return self.model.encode(text)

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize input text by removing unwanted characters and trimming extra whitespace.
        """
        # Remove any characters except letters, numbers, punctuation, and basic symbols
        cleaned = re.sub(r"[^\w\s.,:;!?'\"-]", "", text)
        # Normalize whitespace to single spaces
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        return cleaned

    def validate_text_length(self, text: str, max_length: int = 300) -> bool:
        """
        Validate text length to avoid overly long inputs.
        Returns True if valid, False otherwise.
        """
        return 0 < len(text) <= max_length

    def extract_keywords(self, text: str) -> list:
        """
        Extracts keywords from the given text.
        This is a placeholder and can be enhanced with more sophisticated NLP techniques
        or another LLM for semantic keyword extraction.

        Args:
            text (str): The input text.

        Returns:
            list: A list of extracted keywords.
        """
        # Simple example: convert to lowercase and split by whitespace.
        # For production, consider using NLTK, SpaCy, or an LLM-based keyword extractor.
        return [word for word in text.lower().split() if len(word) > 2]  # Basic filtering

    def generate_caption(self, prompt: str, max_new_tokens: int = 300) -> str:
        """
        Generates a caption based on the provided prompt, enriched with poetic or authoritative quotes.

        :param prompt: The input text prompt for text generation.
        :param max_new_tokens: Maximum number of new tokens to generate.
        :return: The generated caption.
        """
        # Craft a more detailed prompt for the model to generate a fitting caption
        generation_prompt = (
            f"Write a detailed, poetic, and informative paragraph about the following topic: \n'{prompt}'.\n"
            f"Use vivid, emotional language and include relevant verses or quotes by poets, philosophers, or scientists."
            f"The paragraph should be knowledgable, well researched and engaging. The tone should be educational and inspirational, not casual or conversational."
            f"Dont use emojis or hastags or words like response or answer, just write the paragraph directly.\n"
        )

        generated_outputs = self.text_generator(
            generation_prompt,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,  # helps reduce repetition loops
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id
        )

        raw_output = generated_outputs[0]['generated_text']

        # The pipeline returns the prompt followed by the continuation; strip
        # the prompt so only the newly generated caption remains.
        if generation_prompt in raw_output:
            caption = raw_output.split(generation_prompt)[-1].strip()
        else:
            caption = raw_output.strip()

        return caption
|
src/visual_synthesizer.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from diffusers.pipelines.auto_pipeline import AutoPipelineForText2Image
|
| 3 |
+
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
| 4 |
+
from diffusers.utils.export_utils import export_to_video
|
| 5 |
+
from typing import Optional
|
| 6 |
+
# Importing the model name from a configuration file
|
| 7 |
+
# This allows for easy changes to the model without modifying the code
|
| 8 |
+
# Ensure that the model_name is defined in utils/config.py
|
| 9 |
+
from utils.config import IMG_MODEL_NAME, VIDEO_MODEL_NAME, OUTPUT_DIR
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class VisualSynthesizer:
    """Generates images (and, eventually, videos) from text prompts."""

    def __init__(self,
                 img_model: str = IMG_MODEL_NAME,
                 video_model: str = VIDEO_MODEL_NAME):
        """
        Initializes the VisualSynthesizer with a specified text-to-image model.

        Args:
            img_model (str): The Hugging Face model ID for the diffusion model.
            video_model (str): The Hugging Face model ID for the video generation
                model (currently unused — video support is not wired up yet).
        """
        # Prefer GPU with half precision; fall back to CPU with full precision.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if self.device == "cuda" else torch.float32

        torch.backends.cudnn.benchmark = True  # Optimize cuDNN for fixed input sizes

        # Initialize text-to-image pipeline with the specified model
        self.image_pipe = AutoPipelineForText2Image.from_pretrained(
            img_model,
            torch_dtype=self.torch_dtype,
            variant="fp16" if self.torch_dtype == torch.float16 else None,
            low_cpu_mem_usage=True
        ).to(self.device)

        # Text-to-video pipeline — intentionally disabled for the MVP:
        # self.video_pipe = DiffusionPipeline.from_pretrained(
        #     video_model,
        #     torch_dtype=self.torch_dtype,
        #     variant="fp16" if self.torch_dtype == torch.float16 else None,
        #     low_cpu_mem_usage=True
        # ).to(self.device)
        # self.video_pipe.enable_model_cpu_offload()

    def generate_image(self, prompt: str,
                       negative_prompt: str = "blurry, distorted, poorly drawn, watermark",
                       num_inference_steps: int = 50, guidance_scale: float = 7.5):
        """
        Generates a single image from a text prompt.

        Args:
            prompt (str): Text prompt to guide generation.
            negative_prompt (str): Qualities to steer the model away from.
            num_inference_steps (int): Number of denoising steps (more = slower, higher quality).
            guidance_scale (float): How strongly the image follows the prompt.

        Returns:
            PIL.Image.Image: The generated image.
        """
        image = self.image_pipe(prompt,
                                negative_prompt=negative_prompt,
                                num_inference_steps=num_inference_steps,
                                guidance_scale=guidance_scale
                                ).images[0]
        return image

    # TODO: Fix the video generation method — use the correct pipeline and parameters.
    # This is a placeholder; it currently does nothing and returns None.
    def generate_video(
        self,
        prompt: str,
        negative_prompt: Optional[str] = None,
        num_frames: int = 24,  # ~1 second at 24 fps
        fps: int = 8,
        output_path: Optional[str] = "output.mp4",
        guidance_scale: float = 12.5,
        num_inference_steps: int = 25
    ) -> Optional[str]:
        """
        Generates a short video from a text prompt (NOT YET IMPLEMENTED).

        Args:
            prompt (str): Text prompt to guide generation.
            negative_prompt (str): Optional negative prompts.
            num_frames (int): Number of video frames.
            fps (int): Frame rate for the video.
            output_path (str): Path to save output video.
            guidance_scale (float): Guidance scale for generation.
            num_inference_steps (int): Number of inference steps.

        Returns:
            Optional[str]: Path to the saved video file, or None while unimplemented.
        """
        # video_output = self.video_pipe(
        #     prompt=prompt,
        #     negative_prompt=negative_prompt,
        #     num_frames=num_frames,
        #     guidance_scale=guidance_scale,
        #     num_inference_steps=num_inference_steps
        # ).frames

        # result = self.video_pipe(prompt, num_frames=num_frames, **kwargs)
        # frames = result.frames[0]
        # video_path = export_to_video(frames, output_video_path=f"{OUTPUT_DIR}_video", fps=fps)
        # return video_path
        pass
|
utils/__init__.py
ADDED
|
File without changes
|
utils/config.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration variables to centralize parameters and paths.
|
| 3 |
+
"""
|
| 4 |
+
OUTPUT_DIR = "outputs/" # Directory to save generated image and captions
|
| 5 |
+
|
| 6 |
+
# Image generation settings
|
| 7 |
+
IMAGE_SIZE = (512, 512) # Size of the generated images
|
| 8 |
+
|
| 9 |
+
# Model names for easy change and reuse
|
| 10 |
+
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
|
| 11 |
+
TEXT_MODEL_NAME = "microsoft/phi-2"
|
| 12 |
+
AUDIO_MODEL_NAME = "" # Placeholder for audio model, can be set later
|
| 13 |
+
|
| 14 |
+
# Stable Diffusion model and device to run on
|
| 15 |
+
IMG_MODEL_NAME = "runwayml/stable-diffusion-v1-5"
|
| 16 |
+
VIDEO_MODEL_NAME = "cerspense/zeroscope_v2_XL" # Placeholder for video model
|
| 17 |
+
# Other models to try # Qwen/Qwen-Image # CompVis/stable-diffusion-v1-4
|
| 18 |
+
# "segmind/SSD-1B" # Or "kandinsky-community/kandinsky-3", "warp-ai/wuerstchen"
|
| 19 |
+
# Video generation models # cerspense/zeroscope_v2_576w # Wan‑Video/Wan2.1
|
| 20 |
+
DEVICE = "cuda" # Change to "cpu" if no GPU available
|
| 21 |
+
|
| 22 |
+
# Font path for overlay text
|
| 23 |
+
# FONT_PATH = "./fonts/arial.ttf"
|
| 24 |
+
# FONT_SIZE = 40
|
utils/helpers.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from utils.config import OUTPUT_DIR
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def save_caption(caption: str, filename: str, output_dir: str = OUTPUT_DIR):
    """
    Save the generated caption to a text file.

    Args:
        caption (str): The generated text.
        filename (str): The filename to use; '.txt' is appended if missing.
        output_dir (str): Folder where the file will be saved. Defaults to OUTPUT_DIR.

    Returns:
        str: Full path to the saved file.

    Raises:
        ValueError: If `filename` is empty.
    """
    # BUG FIX: validate before touching the filesystem so an invalid call has
    # no side effects (the original created the directory first).
    if not filename:
        raise ValueError("Filename must be provided")
    if not filename.endswith('.txt'):
        filename += '.txt'

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    filepath = os.path.join(output_dir, filename)

    # Save the caption
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(caption.strip())

    return filepath
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def save_image(image, filename: str, output_dir: str = OUTPUT_DIR):
    """
    Saves the generated image to the specified directory.

    Args:
        image: The generated image to save (anything with a PIL-style .save()).
        filename (str): The name of the file to save the image as; '.png' is
            appended if missing.
        output_dir (str): The directory where the image will be saved.

    Returns:
        str: Full path to the saved image.

    Raises:
        ValueError: If `filename` is empty.
    """
    # BUG FIX: validate before touching the filesystem so an invalid call has
    # no side effects (the original created the directory first).
    if not filename:
        raise ValueError("Filename must be provided")
    if not filename.endswith('.png'):
        filename += '.png'

    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    os.makedirs(output_dir, exist_ok=True)

    # Construct the full path and write the image.
    image_path = os.path.join(output_dir, filename)
    image.save(image_path)
    return image_path
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def richify_prompt(text: str) -> str:
    """
    Enrich a raw prompt with quality/style modifiers for the image model and
    normalize its whitespace.

    Args:
        text (str): The input prompt to be enriched.

    Returns:
        str: The enriched, whitespace-normalized prompt.

    Raises:
        ValueError: If `text` is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("Input must be a string")
    text_prompt = (
        f"(best quality:1.3), (intricate details:1.2), high-resolution digital painting of {text}, "
        "ArtStation fine art"
    )
    # Collapse runs of whitespace into single spaces. The original guarded
    # with `if text_prompt else ''` and a trailing .strip(), but text_prompt
    # always contains literal text and join(split()) never leaves edge
    # whitespace — both were dead code.
    return ' '.join(text_prompt.split())
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# More richify prompts
|
| 80 |
+
# image_prompt = f"A beautiful and artistic representation of the following text: '{text}'; in the style of Studio Ghibli, digital art, 4k, vibrant colors, intricate details, Artstation."
|
| 81 |
+
# Epic Cinematic Illustration
|
| 82 |
+
# image_prompt = f"(best quality:1.4), (masterpiece:1.3), (detailed:1.2), 4k, wide-angle cosmic panorama of the Big Bang and expanding universe transitioning into the creation of life on Earth, poetic illumination, vibrant nebulae and galaxies, in the style of Studio Ghibli and ArtStation concept art, divine origins, dramatic lighting, awe‑inspiring mood"
|
| 83 |
+
# Realistic Documentary Style
|
| 84 |
+
# image_prompt = f"(realistic cosmic time-lapse:1.2), (masterpiece:1.2), ultra-detailed 8k scientific illustration of cosmic evolution from the Big Bang to modern civilization, expanding galaxies, formation of Earth, emergence of life, soft ambient lighting, realistic textures, wide-angle shot, inspired by ArtStation and nature documentaries, contemplative mood"
|
| 85 |
+
# Animated Spiritual Universe
|
| 86 |
+
# image_prompt = f"(best quality:1.3), (intricate details:1.2), high-resolution digital painting of the universe expanding from the Big Bang into Earth’s formation, evolving life and early civilization, soft celestial lighting, pastel and vibrant colors, in the style of Studio Ghibli animation, ArtStation fine art, uplifting and mystical atmosphere, panoramic composition"
|
utils/logger.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Log level comes from the environment so deployments can tune verbosity
# without code changes; defaults to INFO.
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

def get_logger(name: str) -> logging.Logger:
    """
    Return a named logger configured with a stream handler and LOG_LEVEL.

    Configuration happens only once per logger name; repeated calls reuse the
    already-configured logger without adding duplicate handlers.
    """
    logger = logging.getLogger(name)
    # BUG FIX: the original checked logger.hasHandlers(), which also reports
    # handlers attached to *ancestor* loggers (e.g. the root logger). Once
    # root logging was configured anywhere, configuration was skipped and
    # LOG_LEVEL never applied. Check this logger's own handlers instead.
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(LOG_FORMAT))
        logger.addHandler(handler)
        logger.setLevel(LOG_LEVEL)
    return logger

# Example usage:
# logger = get_logger(__name__)
# logger.info("Logger initialized.")
|