hadadrjt committed commit 906428b (0 parents)

Pocket TTS: Initial experimental.

Files changed (4):
  1. Dockerfile +10 -0
  2. LICENSE +13 -0
  3. README.md +11 -0
  4. app.py +1053 -0
Dockerfile ADDED
@@ -0,0 +1,10 @@
+ #
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
+ # SPDX-License-Identifier: Apache-2.0
+ #
+
+ FROM hadadrjt/pocket-tts:hf
+
+ WORKDIR /app
+
+ COPY app.py .
LICENSE ADDED
@@ -0,0 +1,13 @@
+ Copyright (c) 2025 Hadad <hadad@linuxmail.org>
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: kyutai/pocket-tts
+ short_description: Pocket TTS optimized for Hugging Face Spaces on CPU
+ license: apache-2.0
+ emoji: ⚡
+ colorFrom: gray
+ colorTo: yellow
+ sdk: docker
+ app_port: 7860
+ pinned: false
+ ---
app.py ADDED
@@ -0,0 +1,1053 @@
+ """
+ ============================================================================
+ AI-GENERATED CODE
+ ============================================================================
+ """
+
+ """
+ Pocket TTS Web Application
+ ==========================
+
+ A Gradio-based web interface for the Pocket TTS text-to-speech model.
+ This application provides an intuitive interface for generating speech
+ from text using either preset voices or voice cloning capabilities.
+
+ Features:
+ ---------
+ - Multiple preset voice options
+ - Voice cloning from uploaded audio files
+ - Configurable generation parameters (temperature, LSD steps, etc.)
+ - Real-time character counting and validation
+ - Temporary file management with automatic cleanup
+ - Thread-safe generation state management
+
+ Usage:
+ ------
+ Run this script directly to launch the web application:
+     $ python app.py
+
+ The application will be available at http://localhost:7860
+ """
+
+ import os
+ import time
+ import torch
+ import tempfile
+ import threading
+ import scipy.io.wavfile
+ import gradio as gr
+ from pocket_tts import TTSModel
+
+
+ # =============================================================================
+ # ENVIRONMENT CONFIGURATION
+ # =============================================================================
+ # Configure PyTorch threading behavior
+ torch.set_num_threads(2)  # Intra-op parallelism threads
+ torch.set_num_interop_threads(2)  # Inter-op parallelism threads
+
+
+ # =============================================================================
+ # APPLICATION CONSTANTS
+ # =============================================================================
+ # Define all configurable constants and default values used throughout
+ # the application. These values control model behavior, UI constraints,
+ # and resource management policies.
+
+ # Available preset voice options for speech generation
+ AVAILABLE_VOICES = [
+     "alba",
+     "marius",
+     "javert",
+     "jean",
+     "fantine",
+     "cosette",
+     "eponine",
+     "azelma"
+ ]
+
+ # Default configuration values
+ DEFAULT_VOICE = "alba"  # Default preset voice selection
+ DEFAULT_MODEL_VARIANT = "b6369a24"  # Model variant identifier
+ DEFAULT_TEMPERATURE = 0.7  # Generation temperature
+ DEFAULT_LSD_DECODE_STEPS = 1  # Latent space decode steps
+ DEFAULT_EOS_THRESHOLD = -4.0  # End-of-sequence detection threshold
+ DEFAULT_NOISE_CLAMP = 0.0  # Noise clamping value (0 = disabled)
+ DEFAULT_FRAMES_AFTER_EOS = 10  # Additional frames after EOS
+
+ # Input constraints and resource management
+ MAXIMUM_INPUT_LENGTH = 1000  # Maximum text input characters
+ TEMPORARY_FILE_LIFETIME_SECONDS = 7200  # Temp file retention (2 hours)
+
+ # Voice mode selection options
+ VOICE_MODE_PRESET = "Preset Voices"  # Use predefined voice
+ VOICE_MODE_CLONE = "Voice Cloning"  # Clone voice from audio
+
+ # Example prompts with associated voice presets for demonstration
+ EXAMPLE_PROMPTS_WITH_VOICES = [
+     {
+         "text": "The quick brown fox jumps over the lazy dog near the riverbank.",
+         "voice": "alba"
+     },
+     {
+         "text": "Welcome to the future of text to speech technology powered by artificial intelligence.",
+         "voice": "marius"
+     },
+     {
+         "text": "Technology continues to push the boundaries of what we thought was possible.",
+         "voice": "javert"
+     },
+     {
+         "text": "The weather today is absolutely beautiful and perfect for a relaxing walk outside.",
+         "voice": "fantine"
+     },
+     {
+         "text": "Science and innovation are transforming how we interact with the world around us.",
+         "voice": "jean"
+     }
+ ]
+
+
+ # =============================================================================
+ # THREAD SYNCHRONIZATION
+ # =============================================================================
+ # Global state management for thread-safe generation operations.
+ # These locks and flags prevent concurrent generation requests and
+ # enable graceful cancellation of ongoing operations.
+
+ generation_state_lock = threading.Lock()  # Lock for generation state access
+ is_currently_generating = False  # Flag indicating active generation
+ stop_generation_requested = False  # Flag for stop request signaling
+
+ # Temporary file registry for cleanup management
+ temporary_files_registry = {}  # Maps file paths to creation timestamps
+ temporary_files_lock = threading.Lock()  # Lock for registry access
+
+
+ # =============================================================================
+ # TEXT-TO-SPEECH MANAGER CLASS
+ # =============================================================================
+
+ class TextToSpeechManager:
+     """
+     Manages TTS model lifecycle and speech generation operations.
+
+     This class handles model loading, configuration caching, voice state
+     management, and audio generation. It implements lazy loading and
+     caching strategies to optimize performance and memory usage.
+
+     Attributes:
+         loaded_model: Currently loaded TTS model instance
+         current_configuration: Dict of current model configuration
+         voice_state_cache: Cache of computed voice states for preset voices
+
+     Example:
+         >>> manager = TextToSpeechManager()
+         >>> manager.load_or_get_model("b6369a24", 0.7, 1, None, -4.0)
+         >>> voice_state = manager.get_voice_state_for_preset("alba")
+         >>> audio = manager.generate_audio("Hello world", voice_state, 10, False)
+     """
+
+     def __init__(self):
+         """Initialize the TTS manager with empty state."""
+         self.loaded_model = None
+         self.current_configuration = {}
+         self.voice_state_cache = {}
+
+     def load_or_get_model(
+         self,
+         model_variant,
+         temperature,
+         lsd_decode_steps,
+         noise_clamp,
+         eos_threshold
+     ):
+         """
+         Load a TTS model or return cached instance if configuration matches.
+
+         This method implements lazy loading with configuration-based caching.
+         If the requested configuration differs from the currently loaded model,
+         a new model instance is created and the voice state cache is cleared.
+
+         Args:
+             model_variant: Model variant identifier string
+             temperature: Generation temperature (float, 0.1-2.0)
+             lsd_decode_steps: Number of LSD decode steps (int, 1-20)
+             noise_clamp: Maximum noise value or None to disable
+             eos_threshold: End-of-sequence detection threshold (float)
+
+         Returns:
+             TTSModel: Loaded and configured TTS model instance
+         """
+         # Process and validate input parameters with defaults
+         processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip()
+         processed_temperature = float(temperature) if temperature is not None else DEFAULT_TEMPERATURE
+         processed_lsd_steps = int(lsd_decode_steps) if lsd_decode_steps is not None else DEFAULT_LSD_DECODE_STEPS
+         processed_noise_clamp = float(noise_clamp) if noise_clamp and float(noise_clamp) > 0 else None
+         processed_eos_threshold = float(eos_threshold) if eos_threshold is not None else DEFAULT_EOS_THRESHOLD
+
+         # Build configuration dictionary for comparison
+         requested_configuration = {
+             "variant": processed_variant,
+             "temp": processed_temperature,
+             "lsd_decode_steps": processed_lsd_steps,
+             "noise_clamp": processed_noise_clamp,
+             "eos_threshold": processed_eos_threshold
+         }
+
+         # Load new model if configuration changed or no model loaded
+         if self.loaded_model is None or self.current_configuration != requested_configuration:
+             self.loaded_model = TTSModel.load_model(**requested_configuration)
+             self.current_configuration = requested_configuration
+             self.voice_state_cache = {}  # Clear cache on model change
+
+         return self.loaded_model
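The caching strategy in `load_or_get_model` boils down to one pattern: keep the expensive object alongside the configuration dict it was built from, and rebuild only when the requested dict differs. A minimal standalone sketch of that pattern (the `loader` callable here is a stand-in, not the real `TTSModel.load_model`):

```python
# Configuration-keyed lazy loading: the expensive factory runs only
# when the requested configuration differs from the cached one.
class ConfigCachedLoader:
    def __init__(self, loader):
        self.loader = loader      # expensive factory, e.g. a model loader
        self.instance = None      # currently loaded object
        self.configuration = {}   # configuration it was loaded with

    def load_or_get(self, **config):
        if self.instance is None or self.configuration != config:
            self.instance = self.loader(**config)
            self.configuration = config
        return self.instance

calls = []  # record every time the "expensive" loader actually runs
loader = ConfigCachedLoader(lambda **cfg: calls.append(cfg) or dict(cfg))

a = loader.load_or_get(variant="b6369a24", temp=0.7)
b = loader.load_or_get(variant="b6369a24", temp=0.7)  # same config: cache hit
c = loader.load_or_get(variant="b6369a24", temp=0.9)  # config changed: reload
```

Two identical requests share one instance; changing any value triggers a reload, which is also the moment the app clears its voice-state cache.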
+
+     def get_voice_state_for_preset(self, voice_name):
+         """
+         Get or compute voice state for a preset voice.
+
+         Voice states are cached to avoid redundant computation for
+         frequently used preset voices.
+
+         Args:
+             voice_name: Name of the preset voice (must be in AVAILABLE_VOICES)
+
+         Returns:
+             Voice state tensor for the specified preset voice
+         """
+         # Validate voice name and fall back to default if invalid
+         validated_voice = voice_name if voice_name in AVAILABLE_VOICES else DEFAULT_VOICE
+
+         # Compute and cache voice state if not already cached
+         if validated_voice not in self.voice_state_cache:
+             self.voice_state_cache[validated_voice] = self.loaded_model.get_state_for_audio_prompt(
+                 audio_conditioning=validated_voice,
+                 truncate=False
+             )
+
+         return self.voice_state_cache[validated_voice]
+
+     def get_voice_state_for_clone(self, audio_file_path):
+         """
+         Compute voice state from an uploaded audio file for voice cloning.
+
+         Unlike preset voices, cloned voice states are not cached as they
+         are typically unique per request.
+
+         Args:
+             audio_file_path: Path to the uploaded audio file
+
+         Returns:
+             Voice state tensor extracted from the audio file
+         """
+         return self.loaded_model.get_state_for_audio_prompt(
+             audio_conditioning=audio_file_path,
+             truncate=False
+         )
+
+     def generate_audio(self, text_content, voice_state, frames_after_eos, enable_custom_frames):
+         """
+         Generate speech audio from text using the specified voice state.
+
+         Args:
+             text_content: Text string to convert to speech
+             voice_state: Pre-computed voice state tensor
+             frames_after_eos: Number of frames to generate after EOS
+             enable_custom_frames: Whether to use custom frame count
+
+         Returns:
+             torch.Tensor: Generated audio waveform
+         """
+         # Apply custom frames setting if enabled
+         processed_frames = int(frames_after_eos) if enable_custom_frames else None
+
+         return self.loaded_model.generate_audio(
+             model_state=voice_state,
+             text_to_generate=text_content,
+             frames_after_eos=processed_frames,
+             copy_state=True
+         )
+
+     def save_audio_to_file(self, audio_tensor):
+         """
+         Save generated audio tensor to a temporary WAV file.
+
+         The file is registered for automatic cleanup after the configured
+         lifetime expires.
+
+         Args:
+             audio_tensor: PyTorch tensor containing audio waveform
+
+         Returns:
+             str: Path to the saved temporary WAV file
+         """
+         # Convert tensor to numpy array for scipy
+         audio_numpy_data = audio_tensor.numpy()
+         audio_sample_rate = self.loaded_model.sample_rate
+
+         # Create temporary file and write audio data
+         output_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+         scipy.io.wavfile.write(output_file.name, audio_sample_rate, audio_numpy_data)
+
+         # Register file for cleanup tracking
+         with temporary_files_lock:
+             temporary_files_registry[output_file.name] = time.time()
+
+         return output_file.name
+
+
+ # Create global TTS manager instance
+ text_to_speech_manager = TextToSpeechManager()
+
+
+ # =============================================================================
+ # UTILITY FUNCTIONS
+ # =============================================================================
+
+ def cleanup_expired_temporary_files():
+     """
+     Remove temporary files that have exceeded their lifetime.
+
+     This function is called periodically to prevent disk space exhaustion
+     from accumulated temporary audio files. Files older than
+     TEMPORARY_FILE_LIFETIME_SECONDS are removed from disk and registry.
+     """
+     current_timestamp = time.time()
+     expired_files = []
+
+     with temporary_files_lock:
+         # Identify expired files
+         for file_path, creation_timestamp in list(temporary_files_registry.items()):
+             if current_timestamp - creation_timestamp > TEMPORARY_FILE_LIFETIME_SECONDS:
+                 expired_files.append(file_path)
+
+         # Remove expired files from disk and registry
+         for file_path in expired_files:
+             try:
+                 if os.path.exists(file_path):
+                     os.remove(file_path)
+                 del temporary_files_registry[file_path]
+             except Exception:
+                 pass  # Silently ignore deletion errors
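The cleanup policy is purely time-based: the registry maps each path to its creation timestamp, and anything older than the lifetime is dropped. The expiry rule can be exercised on its own with synthetic timestamps, no filesystem involved (paths and times below are made up for illustration):

```python
# Standalone sketch of the TTL expiry rule used by
# cleanup_expired_temporary_files: registry entries older than
# `lifetime` seconds (relative to `now`) are removed and returned.
def expire_entries(registry, now, lifetime):
    expired = [path for path, created in registry.items()
               if now - created > lifetime]
    for path in expired:
        del registry[path]
    return expired

# Three fake entries created at t=0, t=6000, and t=7100 seconds;
# checked at t=7201 with the app's 7200-second lifetime.
registry = {"/tmp/a.wav": 0.0, "/tmp/b.wav": 6000.0, "/tmp/c.wav": 7100.0}
expired = expire_entries(registry, now=7201.0, lifetime=7200.0)
```

Only the first entry has aged past the 2-hour window, so it alone is expired; the real function additionally deletes the file from disk under the registry lock.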
+
+
+ def validate_text_input(text_content):
+     """
+     Validate and clean text input for speech generation.
+
+     Args:
+         text_content: Raw text input from user
+
+     Returns:
+         tuple: (is_valid: bool, result: str)
+             - If valid: (True, cleaned_text)
+             - If invalid: (False, error_message or empty string)
+     """
+     # Check for None or non-string input
+     if not text_content or not isinstance(text_content, str):
+         return False, ""
+
+     # Clean whitespace
+     cleaned_text = text_content.strip()
+
+     # Check for empty content
+     if not cleaned_text:
+         return False, ""
+
+     # Check length constraint
+     if len(cleaned_text) > MAXIMUM_INPUT_LENGTH:
+         return False, f"Input exceeds maximum length of {MAXIMUM_INPUT_LENGTH} characters."
+
+     return True, cleaned_text
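The `(is_valid, result)` contract above is worth pinning down with example calls, since the second element means different things on each path: cleaned text on success, an error message on length violations, and an empty string otherwise. The function is reproduced standalone here (with the constant inlined) purely to demonstrate the contract:

```python
# Same validation logic as validate_text_input, lifted out with the
# constant inlined so the return contract can be shown with examples.
MAXIMUM_INPUT_LENGTH = 1000

def validate_text_input(text_content):
    if not text_content or not isinstance(text_content, str):
        return False, ""                      # missing/non-string: no message
    cleaned_text = text_content.strip()
    if not cleaned_text:
        return False, ""                      # whitespace-only: no message
    if len(cleaned_text) > MAXIMUM_INPUT_LENGTH:
        return False, f"Input exceeds maximum length of {MAXIMUM_INPUT_LENGTH} characters."
    return True, cleaned_text                 # valid: stripped text

ok, text = validate_text_input("  Hello world  ")
bad, msg = validate_text_input("x" * 1001)
```

Callers key off this distinction: `perform_speech_generation` raises the message when one exists and falls back to a generic error when the payload is empty.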
+
+
+ def request_generation_stop():
+     """
+     Signal a request to stop the current generation.
+
+     Returns:
+         gr.update: Update to disable the stop button
+     """
+     global stop_generation_requested
+     stop_generation_requested = True
+     return gr.update(interactive=False)
+
+
+ # =============================================================================
+ # SPEECH GENERATION FUNCTION
+ # =============================================================================
+
+ def perform_speech_generation(
+     text_input,
+     voice_mode_selection,
+     voice_preset_selection,
+     voice_clone_audio_file,
+     model_variant,
+     lsd_decode_steps,
+     temperature,
+     noise_clamp,
+     eos_threshold,
+     frames_after_eos,
+     enable_custom_frames
+ ):
+     """
+     Perform the complete speech generation workflow.
+
+     This function orchestrates the entire generation process including:
+     validation, model loading, voice state preparation, audio generation,
+     and file saving. It handles thread safety and stop requests.
+
+     Args:
+         text_input: Text to convert to speech
+         voice_mode_selection: "Preset Voices" or "Voice Cloning"
+         voice_preset_selection: Selected preset voice name
+         voice_clone_audio_file: Path to uploaded audio for cloning
+         model_variant: Model variant identifier
+         lsd_decode_steps: Number of LSD decode steps
+         temperature: Generation temperature
+         noise_clamp: Noise clamping value
+         eos_threshold: End-of-sequence threshold
+         frames_after_eos: Frames to generate after EOS
+         enable_custom_frames: Whether to use custom frame count
+
+     Returns:
+         str or None: Path to generated audio file, or None if stopped
+
+     Raises:
+         gr.Error: On validation failure or generation error
+     """
+     global is_currently_generating, stop_generation_requested
+
+     # Run cleanup before starting new generation
+     cleanup_expired_temporary_files()
+
+     # Validate text input
+     is_valid, validation_result = validate_text_input(text_input)
+
+     if not is_valid:
+         if validation_result:
+             raise gr.Error(validation_result)
+         raise gr.Error("Please enter valid text to generate speech.")
+
+     # Validate voice cloning audio if in clone mode
+     if voice_mode_selection == VOICE_MODE_CLONE and not voice_clone_audio_file:
+         raise gr.Error("Please upload an audio file for voice cloning.")
+
+     # Acquire generation lock
+     with generation_state_lock:
+         if is_currently_generating:
+             raise gr.Error("A generation is already in progress. Please wait.")
+         is_currently_generating = True
+         stop_generation_requested = False
+
+     try:
+         # Load or retrieve cached model
+         text_to_speech_manager.load_or_get_model(
+             model_variant,
+             temperature,
+             lsd_decode_steps,
+             noise_clamp,
+             eos_threshold
+         )
+
+         # Check for stop request after model loading
+         if stop_generation_requested:
+             return None
+
+         # Prepare voice state based on mode
+         if voice_mode_selection == VOICE_MODE_CLONE:
+             voice_state = text_to_speech_manager.get_voice_state_for_clone(voice_clone_audio_file)
+         else:
+             voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)
+
+         # Check for stop request after voice state preparation
+         if stop_generation_requested:
+             return None
+
+         # Generate audio from text
+         generated_audio = text_to_speech_manager.generate_audio(
+             validation_result,
+             voice_state,
+             frames_after_eos,
+             enable_custom_frames
+         )
+
+         # Check for stop request after generation
+         if stop_generation_requested:
+             return None
+
+         # Save audio to temporary file
+         output_file_path = text_to_speech_manager.save_audio_to_file(generated_audio)
+
+         return output_file_path
+
+     except gr.Error:
+         raise
+
+     except Exception as generation_error:
+         raise gr.Error(f"Speech generation failed: {str(generation_error)}")
+
+     finally:
+         # Always release generation lock
+         with generation_state_lock:
+             is_currently_generating = False
+             stop_generation_requested = False
+
+
+ # =============================================================================
+ # UI STATE MANAGEMENT FUNCTIONS
+ # =============================================================================
+
+ def check_generate_button_state(text_content):
+     """
+     Update generate button interactivity based on text validity.
+
+     Args:
+         text_content: Current text input content
+
+     Returns:
+         gr.update: Update with interactive state
+     """
+     is_valid, _ = validate_text_input(text_content)
+     return gr.update(interactive=is_valid)
+
+
+ def calculate_character_count_display(text_content):
+     """
+     Generate HTML for character count display with color coding.
+
+     Args:
+         text_content: Current text input content
+
+     Returns:
+         str: HTML string for character count display
+     """
+     character_count = len(text_content) if text_content else 0
+
+     # Use error color if over limit
+     display_color = (
+         "var(--error-text-color)"
+         if character_count > MAXIMUM_INPUT_LENGTH
+         else "var(--body-text-color-subdued)"
+     )
+
+     return f"<div style='text-align: right; padding: 4px 0;'><span style='color: {display_color}; font-size: 0.85em;'>{character_count} / {MAXIMUM_INPUT_LENGTH}</span></div>"
+
+
+ def determine_clear_button_visibility_idle(text_content, audio_output):
+     """
+     Determine clear button visibility based on content state.
+
+     Args:
+         text_content: Current text input content
+         audio_output: Current audio output value
+
+     Returns:
+         gr.update: Update with visibility state
+     """
+     has_text_content = bool(text_content and text_content.strip())
+     has_audio_output = audio_output is not None
+     should_show_clear = has_text_content or has_audio_output
+     return gr.update(visible=should_show_clear)
+
+
+ def update_voice_mode_visibility(voice_mode_value):
+     """
+     Update visibility of voice selection containers based on mode.
+
+     Args:
+         voice_mode_value: Selected voice mode
+
+     Returns:
+         tuple: (preset_container_update, clone_container_update)
+     """
+     if voice_mode_value == VOICE_MODE_CLONE:
+         return gr.update(visible=False), gr.update(visible=True)
+     else:
+         return gr.update(visible=True), gr.update(visible=False)
+
+
+ def switch_to_generating_state():
+     """
+     Switch UI to generation-in-progress state.
+
+     Returns:
+         tuple: Updates for (generate_button, stop_button, clear_button)
+     """
+     return (
+         gr.update(visible=False),  # Hide generate button
+         gr.update(visible=True, interactive=True),  # Show stop button
+         gr.update(visible=False)  # Hide clear button
+     )
+
+
+ def switch_to_idle_state(text_content, audio_output):
+     """
+     Switch UI back to idle state after generation.
+
+     Args:
+         text_content: Current text input content
+         audio_output: Current audio output value
+
+     Returns:
+         tuple: Updates for (generate_button, stop_button, clear_button)
+     """
+     has_text_content = bool(text_content and text_content.strip())
+     has_audio_output = audio_output is not None
+     should_show_clear = has_text_content or has_audio_output
+
+     return (
+         gr.update(visible=True),  # Show generate button
+         gr.update(visible=False),  # Hide stop button
+         gr.update(visible=should_show_clear)  # Show clear if content exists
+     )
+
+
+ def perform_clear_action():
+     """
+     Clear all input and output fields.
+
+     Returns:
+         tuple: Reset values for all clearable components
+     """
+     return (
+         "",  # Clear text input
+         None,  # Clear audio output
+         gr.update(visible=False),  # Hide clear button
+         VOICE_MODE_PRESET,  # Reset voice mode
+         DEFAULT_VOICE,  # Reset voice preset
+         None  # Clear clone audio
+     )
+
+
+ # =============================================================================
+ # EXAMPLE HANDLING FUNCTIONS
+ # =============================================================================
+
+ def create_example_handler(example_text, example_voice):
+     """
+     Create a handler function for example button clicks.
+
+     Args:
+         example_text: Example text to set
+         example_voice: Example voice to select
+
+     Returns:
+         function: Handler that sets example values
+     """
+     def set_example_values():
+         return example_text, VOICE_MODE_PRESET, example_voice
+     return set_example_values
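The factory function above exists for a reason beyond tidiness: building per-example handlers in a loop with a bare lambda over the loop variable late-binds, so every button would end up wired to the last example. The factory freezes each value at creation time. A minimal illustration of the difference (strings are placeholders):

```python
# Late binding: each lambda closes over the same loop variable, so
# all three handlers observe its final value after the loop ends.
late_bound = [lambda: text for text in ["first", "second", "third"]]

# Factory pattern (as in create_example_handler): each returned
# closure gets its own binding, frozen at the call that created it.
def make_handler(text):
    def handler():
        return text
    return handler

early_bound = [make_handler(text) for text in ["first", "second", "third"]]
```

This is why the UI wiring calls `create_example_handler(example["text"], example["voice"])` once per example instead of sharing a lambda.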
642
+
643
+
644
+ def format_example_button_label(example_text, example_voice, max_text_length=40):
645
+ """
646
+ Format example button label with voice and truncated text.
647
+
648
+ Args:
649
+ example_text: Full example text
650
+ example_voice: Voice name
651
+ max_text_length: Maximum text length before truncation
652
+
653
+ Returns:
654
+ str: Formatted button label
655
+ """
656
+ truncated_text = (
657
+ example_text[:max_text_length] + "..."
658
+ if len(example_text) > max_text_length
659
+ else example_text
660
+ )
661
+ return f"[{example_voice}] {truncated_text}"
662
+
663
+
664
+ # =============================================================================
665
+ # GRADIO APPLICATION DEFINITION
666
+ # =============================================================================
667
+
668
+ with gr.Blocks() as application:
669
+
670
+ # -------------------------------------------------------------------------
671
+ # SIDEBAR SECTION
672
+ # -------------------------------------------------------------------------
673
+ # Contains project information, description, and credits
674
+
675
+ with gr.Sidebar():
676
+ gr.HTML(
677
+ """
678
+ <h1>Audio Generation Playground part of the
679
+ <a href="https://huggingface.co/spaces/hadadxyz/ai" target="_blank">
680
+ Demo Playground</a>, and the
681
+ <a href="https://huggingface.co/umint" target="_blank">
682
+ UltimaX Intelligence</a> project.</h1><br />
683
+
684
+ This space runs the <b><a href="https://huggingface.co/kyutai/pocket-tts"
685
+ target="_blank">Pocket TTS</a></b> model from <b>Kyutai</b>.<br /><br />
686
+
687
+ A lightweight text-to-speech (TTS) application designed to run
688
+ efficiently on CPUs. Forget about the hassle of using GPUs and
689
+ web APIs serving TTS models.<br /><br />
690
+
691
+ Additionally, this space runs with a custom Docker image to
692
+ maximize the model's potential and has been optimized for the
693
+ limited scope of Hugging Face Spaces.<br /><br />
694
+
695
+ ⚠️ This space was created entirely by the
696
+ <b><a href="https://huggingface.co/hadadrjt/JARVIS" target="_blank">
697
+ J.A.R.V.I.S.</a></b> model operating in autonomous agent mode.
698
+ All code was generated by AI without human review.<br /><br />
699
+
700
+ This is an experimental space and is not part of production.
701
+ There may be minor bugs since the code was generated by AI.
702
+ However, none have been found so far.<br /><br />
703
+
704
+ If you find a bug, please report it in the community tab.<br /><br />
705
+
706
+ <b>Like this project? You can support me by buying a
707
+ <a href="https://ko-fi.com/hadad" target="_blank">coffee</a></b>
708
+ """
709
+ )
710
+
711
+ # -------------------------------------------------------------------------
712
+ # AUDIO OUTPUT SECTION
713
+ # -------------------------------------------------------------------------
714
+
715
+ audio_output_component = gr.Audio(
716
+ label="Generated Speech Output",
717
+ type="filepath",
718
+ interactive=False,
719
+ show_download_button=True
720
+ )
721
+
722
+ # -------------------------------------------------------------------------
723
+ # VOICE SELECTION SECTION
724
+ # -------------------------------------------------------------------------
725
+
726
+ with gr.Accordion("🎭 Voice Selection", open=True):
727
+ # Voice mode selector (preset vs cloning)
728
+ voice_mode_radio = gr.Radio(
729
+ label="Voice Mode",
730
+ choices=[VOICE_MODE_PRESET, VOICE_MODE_CLONE],
731
+ value=VOICE_MODE_PRESET,
732
+ info="Choose between preset voices or clone a voice from uploaded audio"
733
+ )
734
+
735
+ # Container for preset voice selection
736
+ with gr.Column(visible=True) as preset_voice_container:
737
+ voice_preset_dropdown = gr.Dropdown(
738
+ label="Select Preset Voice",
739
+ choices=AVAILABLE_VOICES,
740
+ value=DEFAULT_VOICE
741
+ )
742
+
743
+ # Container for voice cloning audio upload
744
+ with gr.Column(visible=False) as clone_voice_container:
745
+ voice_clone_audio_input = gr.Audio(
746
+ label="Upload Audio for Voice Cloning",
747
+ type="filepath"
748
+ )
749
+
750
+ # -------------------------------------------------------------------------
751
+ # GENERATION PARAMETERS SECTION
752
+ # -------------------------------------------------------------------------
753
+
754
+ with gr.Accordion("⚙️ Generation Parameters", open=False):
755
+ with gr.Row():
756
+ temperature_slider = gr.Slider(
757
+ label="Temperature",
758
+ minimum=0.1,
759
+ maximum=2.0,
760
+ step=0.05,
761
+ value=DEFAULT_TEMPERATURE,
762
+ info="Higher values produce more expressive speech"
763
+ )
764
+ lsd_decode_steps_slider = gr.Slider(
765
+ label="LSD Decode Steps",
766
+ minimum=1,
767
+ maximum=20,
768
+ step=1,
769
+ value=DEFAULT_LSD_DECODE_STEPS,
770
+ info="More steps may improve quality but slower"
771
+ )
772
+
773
+ with gr.Row():
774
+ noise_clamp_slider = gr.Slider(
775
+ label="Noise Clamp",
776
+ minimum=0.0,
777
+ maximum=2.0,
778
+ step=0.05,
779
+ value=DEFAULT_NOISE_CLAMP,
780
+ info="Maximum noise sampling value (0 = disabled)"
781
+ )
782
+ eos_threshold_slider = gr.Slider(
783
+ label="End of Sequence Threshold",
784
+ minimum=-10.0,
785
+ maximum=0.0,
786
+ step=0.25,
787
+ value=DEFAULT_EOS_THRESHOLD,
788
+ info="Smaller values cause earlier completion"
789
+ )
790
+
791
+ # -------------------------------------------------------------------------
+ # ADVANCED SETTINGS SECTION
+ # -------------------------------------------------------------------------
+
+ with gr.Accordion("🔧 Advanced Settings", open=False):
+ model_variant_textbox = gr.Textbox(
+ label="Model Variant Identifier",
+ value=DEFAULT_MODEL_VARIANT,
+ info="Model signature for generation"
+ )
+
+ with gr.Row():
+ enable_custom_frames_checkbox = gr.Checkbox(
+ label="Enable Custom Frames After EOS",
+ value=False,
+ info="Manually control post-EOS frame generation"
+ )
+ frames_after_eos_slider = gr.Slider(
+ label="Frames After EOS",
+ minimum=0,
+ maximum=100,
+ step=1,
+ value=DEFAULT_FRAMES_AFTER_EOS,
+ info="Additional frames after end-of-sequence (80ms per frame)"
+ )
+
+ # -------------------------------------------------------------------------
+ # TEXT INPUT SECTION
+ # -------------------------------------------------------------------------
+
+ text_input_component = gr.Textbox(
+ label="Prompt",
+ placeholder="Enter the text you want to convert to speech...",
+ lines=3,
+ max_lines=20,
+ max_length=MAXIMUM_INPUT_LENGTH,
+ autoscroll=True
+ )
+
+ # Character count display
+ character_count_display = gr.HTML(
+ f"<div style='text-align: right; padding: 4px 0;'><span style='color: var(--body-text-color-subdued); font-size: 0.85em;'>0 / {MAXIMUM_INPUT_LENGTH}</span></div>"
+ )
+
+ # -------------------------------------------------------------------------
+ # ACTION BUTTONS SECTION
+ # -------------------------------------------------------------------------
+
+ # Primary generate button
+ generate_button = gr.Button(
+ "🎙️ Generate Speech",
+ variant="primary",
+ size="lg",
+ interactive=False
+ )
+
+ # Stop button (visible during generation)
+ stop_button = gr.Button(
+ "⏹️ Stop Generation",
+ variant="stop",
+ size="lg",
+ visible=False
+ )
+
+ # Clear button (visible when content exists)
+ clear_button = gr.Button(
+ "🗑️ Clear",
+ variant="secondary",
+ size="lg",
+ visible=False
+ )
+
+ # -------------------------------------------------------------------------
+ # EXAMPLE PROMPTS SECTION
+ # -------------------------------------------------------------------------
+
+ gr.HTML("""
+ <div style="padding: 16px 0 8px 0;">
+ <h3 style="margin: 0 0 8px 0; font-size: 1.1em;">💡 Example Prompts</h3>
+ <p style="margin: 0; opacity: 0.7; font-size: 0.9em;">Click any example to generate speech with its assigned voice</p>
+ </div>
+ """)
+
+ # Create example buttons and collect them for event binding
+ example_buttons_list = []
+
+ with gr.Row():
+ example_button_0 = gr.Button(
+ format_example_button_label(
+ EXAMPLE_PROMPTS_WITH_VOICES[0]["text"],
+ EXAMPLE_PROMPTS_WITH_VOICES[0]["voice"]
+ ),
+ size="sm",
+ variant="secondary"
+ )
+ example_buttons_list.append(example_button_0)
+
+ example_button_1 = gr.Button(
+ format_example_button_label(
+ EXAMPLE_PROMPTS_WITH_VOICES[1]["text"],
+ EXAMPLE_PROMPTS_WITH_VOICES[1]["voice"]
+ ),
+ size="sm",
+ variant="secondary"
+ )
+ example_buttons_list.append(example_button_1)
+
+ with gr.Row():
+ example_button_2 = gr.Button(
+ format_example_button_label(
+ EXAMPLE_PROMPTS_WITH_VOICES[2]["text"],
+ EXAMPLE_PROMPTS_WITH_VOICES[2]["voice"]
+ ),
+ size="sm",
+ variant="secondary"
+ )
+ example_buttons_list.append(example_button_2)
+
+ example_button_3 = gr.Button(
+ format_example_button_label(
+ EXAMPLE_PROMPTS_WITH_VOICES[3]["text"],
+ EXAMPLE_PROMPTS_WITH_VOICES[3]["voice"]
+ ),
+ size="sm",
+ variant="secondary"
+ )
+ example_buttons_list.append(example_button_3)
+
+ with gr.Row():
+ example_button_4 = gr.Button(
+ format_example_button_label(
+ EXAMPLE_PROMPTS_WITH_VOICES[4]["text"],
+ EXAMPLE_PROMPTS_WITH_VOICES[4]["voice"]
+ ),
+ size="sm",
+ variant="secondary"
+ )
+ example_buttons_list.append(example_button_4)
+
+ # -------------------------------------------------------------------------
+ # EVENT HANDLERS AND BINDINGS
+ # -------------------------------------------------------------------------
+
+ # Define input components list for generation function
+ generation_inputs = [
+ text_input_component,
+ voice_mode_radio,
+ voice_preset_dropdown,
+ voice_clone_audio_input,
+ model_variant_textbox,
+ lsd_decode_steps_slider,
+ temperature_slider,
+ noise_clamp_slider,
+ eos_threshold_slider,
+ frames_after_eos_slider,
+ enable_custom_frames_checkbox
+ ]
+
+ # Voice mode change handler
+ voice_mode_radio.change(
+ fn=update_voice_mode_visibility,
+ inputs=[voice_mode_radio],
+ outputs=[preset_voice_container, clone_voice_container]
+ )
+
+ # Text input change handlers
+ text_input_component.change(
+ fn=calculate_character_count_display,
+ inputs=[text_input_component],
+ outputs=[character_count_display]
+ )
+
+ text_input_component.change(
+ fn=check_generate_button_state,
+ inputs=[text_input_component],
+ outputs=[generate_button]
+ )
+
+ text_input_component.change(
+ fn=determine_clear_button_visibility_idle,
+ inputs=[text_input_component, audio_output_component],
+ outputs=[clear_button]
+ )
+
+ # Audio output change handler
+ audio_output_component.change(
+ fn=determine_clear_button_visibility_idle,
+ inputs=[text_input_component, audio_output_component],
+ outputs=[clear_button]
+ )
+
+ # Generate button click handler chain
+ generate_button.click(
+ fn=switch_to_generating_state,
+ outputs=[generate_button, stop_button, clear_button]
+ ).then(
+ fn=perform_speech_generation,
+ inputs=generation_inputs,
+ outputs=[audio_output_component]
+ ).then(
+ fn=switch_to_idle_state,
+ inputs=[text_input_component, audio_output_component],
+ outputs=[generate_button, stop_button, clear_button]
+ ).then(
+ fn=check_generate_button_state,
+ inputs=[text_input_component],
+ outputs=[generate_button]
+ )
+
+ # Stop button handler
+ stop_button.click(
+ fn=request_generation_stop,
+ outputs=[stop_button]
+ )
+
+ # Clear button handler
+ clear_button.click(
+ fn=perform_clear_action,
+ outputs=[
+ text_input_component,
+ audio_output_component,
+ clear_button,
+ voice_mode_radio,
+ voice_preset_dropdown,
+ voice_clone_audio_input
+ ]
+ )
+
+ # Example button handlers
+ for button_index, example_button in enumerate(example_buttons_list):
+ example_text = EXAMPLE_PROMPTS_WITH_VOICES[button_index]["text"]
+ example_voice = EXAMPLE_PROMPTS_WITH_VOICES[button_index]["voice"]
+
+ example_button.click(
+ fn=create_example_handler(example_text, example_voice),
+ outputs=[text_input_component, voice_mode_radio, voice_preset_dropdown]
+ ).then(
+ fn=switch_to_generating_state,
+ outputs=[generate_button, stop_button, clear_button]
+ ).then(
+ fn=perform_speech_generation,
+ inputs=generation_inputs,
+ outputs=[audio_output_component]
+ ).then(
+ fn=switch_to_idle_state,
+ inputs=[text_input_component, audio_output_component],
+ outputs=[generate_button, stop_button, clear_button]
+ ).then(
+ fn=check_generate_button_state,
+ inputs=[text_input_component],
+ outputs=[generate_button]
+ )
+
+
+ # =============================================================================
+ # APPLICATION ENTRY POINT
+ # =============================================================================
+
+ if __name__ == "__main__":
+ application.launch(
+ server_name="0.0.0.0",
+ share=False
+ )