Spaces:

MCP-1st-Birthday
/

OmniMind-Orchestrator

Running

File size: 5,099 Bytes

8d35b9c

"""

ElevenLabs Voice Interface - For $2K + AirPods Pro Prize



Voice-first enterprise AI interaction.

"""

import os
from typing import Optional, AsyncGenerator
import asyncio

# Optional dependency: try to import the ElevenLabs SDK and record the outcome
# in ELEVENLABS_AVAILABLE so the module can still be imported without it.
try:
    from elevenlabs import ElevenLabs, VoiceSettings
    from elevenlabs.client import AsyncElevenLabs
    ELEVENLABS_AVAILABLE = True
except ImportError:
    # SDK missing: VoiceInterface checks this flag and leaves its client unset.
    ELEVENLABS_AVAILABLE = False
    print("[WARNING]  ElevenLabs not installed")


class VoiceInterface:
    """
    Voice-first interface for OmniMind using ElevenLabs.

    Prize Integration: ElevenLabs Category Award ($2K + AirPods Pro)
    - Natural conversational AI
    - Streaming voice responses
    - Enterprise-grade voice quality

    When the ElevenLabs SDK is not installed or ``ELEVENLABS_API_KEY`` is not
    set, ``client`` is ``None``: text-to-speech yields nothing and voice
    listing reports ``"unavailable"``, but persona selection still works.
    """

    def __init__(self) -> None:
        """Read the API key from the environment and build the async client."""
        self.api_key = os.getenv("ELEVENLABS_API_KEY")

        # Persona table and current selection are set unconditionally so that
        # set_voice() works (instead of raising AttributeError) on an
        # unconfigured instance.
        self.voices = {
            "professional": "ErXwobaYiN019PkySvjV",  # Antoni - professional male
            "friendly": "EXAVITQu4vr4xnSDxMaL",  # Sarah - friendly female
            "executive": "VR6AewLTigWG4xSOukaG",  # Arnold - authoritative male
        }
        self.current_voice = "professional"

        self.client = None
        if not self.api_key or not ELEVENLABS_AVAILABLE:
            print("[WARNING]  ElevenLabs not configured")
            return

        self.client = AsyncElevenLabs(api_key=self.api_key)

    @staticmethod
    async def _resolve(result):
        """Await ``result`` if it is awaitable; otherwise return it unchanged.

        Lets the caller work both with SDK methods implemented as coroutines
        and with ones implemented as async generators (which must not be
        awaited).
        """
        import inspect

        if inspect.isawaitable(result):
            return await result
        return result

    async def text_to_speech(
        self,
        text: str,
        voice: str = "professional",
        stream: bool = True
    ) -> AsyncGenerator[bytes, None]:
        """
        Convert text to speech with streaming support.

        Args:
            text: Text to convert
            voice: Voice persona (professional, friendly, executive);
                unknown names fall back to "professional"
            stream: Stream audio chunks for real-time playback; when False a
                single item containing the complete audio is yielded

        Yields:
            Audio chunks (bytes). Nothing is yielded if the client is not
            configured.
        """
        if not self.client:
            # Not configured: behave as an empty generator.
            return

        voice_id = self.voices.get(voice, self.voices["professional"])
        tts = self.client.text_to_speech

        # Same request for both modes; only the endpoint differs.
        request = {
            "text": text,
            "voice_id": voice_id,
            "model_id": "eleven_turbo_v2_5",  # Fastest model
            "voice_settings": VoiceSettings(
                stability=0.5,
                similarity_boost=0.75,
                style=0.5,
                use_speaker_boost=True,
            ),
        }

        if stream:
            # Streaming for real-time responses. The call may hand back an
            # async iterator directly, so only await it when it is awaitable.
            audio_stream = await self._resolve(tts.convert_as_stream(**request))
            async for chunk in audio_stream:
                yield chunk
            return

        # Non-streaming for complete audio.
        audio = await self._resolve(tts.convert(**request))
        if hasattr(audio, "__aiter__"):
            # The SDK returned an async iterator of chunks: assemble them so
            # the caller still receives one complete bytes payload.
            yield b"".join([chunk async for chunk in audio])
        else:
            yield audio

    async def speech_to_text(self, audio_data: bytes) -> str:
        """
        Convert speech to text using OpenAI Whisper (``whisper-1``).

        The audio is written to a temporary ``.wav`` file, sent to the
        transcription endpoint, and the file is removed afterwards even if
        writing or transcription fails.

        Args:
            audio_data: Audio bytes (WAV format)

        Returns:
            Transcribed text

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        # Speech-to-text is delegated to OpenAI Whisper. Imported lazily so
        # the module loads without the openai package.
        import tempfile

        from openai import AsyncOpenAI

        # NOTE: there must be no `import os` inside this function; it would
        # make `os` a local name and this line would raise UnboundLocalError.
        openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # delete=False so the file can be closed and then reopened by path.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        audio_path = tmp.name

        try:
            with tmp:
                tmp.write(audio_data)

            with open(audio_path, "rb") as audio_file:
                transcript = await openai_client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file
                )

            return transcript.text

        finally:
            # Cleanup
            os.unlink(audio_path)

    async def get_available_voices(self) -> dict:
        """Get list of available voices.

        Returns:
            ``{"status": "unavailable", "voices": []}`` when the client is not
            configured; otherwise ``{"status": "success", "voices": [...]}``
            with one ``voice_id`` / ``name`` / ``category`` dict per voice
            returned by the API.
        """
        if not self.client:
            return {"status": "unavailable", "voices": []}

        response = await self.client.voices.get_all()

        return {
            "status": "success",
            "voices": [
                {
                    "voice_id": entry.voice_id,
                    "name": entry.name,
                    "category": entry.category
                }
                for entry in response.voices
            ]
        }

    def set_voice(self, voice_name: str) -> bool:
        """Set the current voice persona.

        Returns:
            True if ``voice_name`` is a known persona and was selected,
            False otherwise (the current selection is left unchanged).
        """
        if voice_name in self.voices:
            self.current_voice = voice_name
            return True
        return False


# Global voice interface: shared module-level instance, constructed at import
# time. Its `client` is None when the ElevenLabs SDK or ELEVENLABS_API_KEY is
# missing (see VoiceInterface.__init__).
voice = VoiceInterface()