File size: 5,099 Bytes
8d35b9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""

ElevenLabs Voice Interface - For $2K + AirPods Pro Prize



Voice-first enterprise AI interaction.

"""

import os
from typing import Optional, AsyncGenerator
import asyncio

# The ElevenLabs SDK is treated as an optional dependency: record whether it
# imported so the rest of the module can check ELEVENLABS_AVAILABLE instead of
# failing at import time.
try:
    from elevenlabs import ElevenLabs, VoiceSettings
    from elevenlabs.client import AsyncElevenLabs
    ELEVENLABS_AVAILABLE = True
except ImportError:
    # SDK missing: the names imported above stay undefined, so callers must
    # test ELEVENLABS_AVAILABLE before touching them.
    ELEVENLABS_AVAILABLE = False
    print("[WARNING]  ElevenLabs not installed")


class VoiceInterface:
    """Voice-first interface for OmniMind using ElevenLabs.

    Prize Integration: ElevenLabs Category Award ($2K + AirPods Pro)
    - Natural conversational AI
    - Streaming voice responses
    - Enterprise-grade voice quality

    If the ElevenLabs SDK could not be imported or ``ELEVENLABS_API_KEY`` is
    not set, ``client`` is ``None``: ``text_to_speech`` then yields nothing
    and ``get_available_voices`` reports ``"unavailable"``.
    """

    def __init__(self):
        """Read the API key from the environment and build the async client."""
        self.api_key = os.getenv("ELEVENLABS_API_KEY")

        # Voice configurations for different personas.  These are assigned
        # before the "not configured" early return so that set_voice() and
        # other attribute reads work on an unconfigured instance too.
        self.voices = {
            "professional": "ErXwobaYiN019PkySvjV",  # Antoni - professional male
            "friendly": "EXAVITQu4vr4xnSDxMaL",  # Sarah - friendly female
            "executive": "VR6AewLTigWG4xSOukaG",  # Arnold - authoritative male
        }
        self.current_voice = "professional"

        if not ELEVENLABS_AVAILABLE or not self.api_key:
            self.client = None
            print("[WARNING]  ElevenLabs not configured")
            return

        self.client = AsyncElevenLabs(api_key=self.api_key)

    @staticmethod
    async def _iter_audio(result):
        """Yield audio byte chunks from whatever shape the SDK call returned.

        Accepts an awaitable, an async iterator, a plain iterable of chunks,
        or a single bytes object, so the caller does not depend on which of
        these a given SDK version hands back.
        """
        import inspect  # local import: only needed by this helper

        if inspect.isawaitable(result):
            result = await result

        if isinstance(result, (bytes, bytearray)):
            yield bytes(result)
        elif hasattr(result, "__aiter__"):
            async for chunk in result:
                yield chunk
        else:
            for chunk in result:
                yield chunk

    async def text_to_speech(
        self,
        text: str,
        voice: str = "professional",
        stream: bool = True
    ) -> AsyncGenerator[bytes, None]:
        """Convert text to speech with streaming support.

        Note: ``current_voice`` (changed via ``set_voice``) is not consulted
        here; the persona comes only from the ``voice`` argument.

        Args:
            text: Text to convert.
            voice: Voice persona (professional, friendly, executive).
                Unknown names fall back to "professional".
            stream: If true, yield audio chunks as they arrive for real-time
                playback; otherwise yield one bytes object holding the
                complete audio.

        Yields:
            Audio chunks (bytes).  Nothing is yielded when no client is
            configured.
        """
        if not self.client:
            # Not configured: behave as an empty async generator.
            return

        voice_id = self.voices.get(voice, self.voices["professional"])

        tts = self.client.text_to_speech
        request = tts.convert_as_stream if stream else tts.convert

        # The call result is NOT awaited here: the async SDK methods are
        # expected to return an async iterator of bytes, and awaiting one
        # raises TypeError.  _iter_audio copes with either shape.
        result = request(
            text=text,
            voice_id=voice_id,
            model_id="eleven_turbo_v2_5",  # Fastest model
            voice_settings=VoiceSettings(
                stability=0.5,
                similarity_boost=0.75,
                style=0.5,
                use_speaker_boost=True
            )
        )

        if stream:
            # Streaming for real-time responses
            async for chunk in self._iter_audio(result):
                yield chunk
        else:
            # Non-streaming: gather everything into one complete payload
            yield b"".join([chunk async for chunk in self._iter_audio(result)])

    async def speech_to_text(self, audio_data: bytes) -> str:
        """Convert speech to text using OpenAI Whisper (model ``whisper-1``).

        Transcription is routed through the OpenAI client rather than
        ElevenLabs; the key is read from ``OPENAI_API_KEY``.

        Args:
            audio_data: Audio bytes (WAV format).

        Returns:
            Transcribed text.
        """
        # Imported lazily so the module loads without the openai package.
        from openai import AsyncOpenAI
        import tempfile

        # NOTE: there must be no `import os` anywhere in this function; a
        # local import would shadow the module-level `os` for the whole body
        # and make this call raise UnboundLocalError.
        openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # Save audio temporarily; delete=False so it can be reopened by path.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        audio_path = tmp.name

        try:
            with tmp:
                tmp.write(audio_data)

            with open(audio_path, "rb") as audio_file:
                transcript = await openai_client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file
                )

            return transcript.text

        finally:
            # Cleanup runs even if writing or transcription fails.
            os.unlink(audio_path)

    async def get_available_voices(self) -> dict:
        """Get list of available voices.

        Returns:
            ``{"status": "unavailable", "voices": []}`` when no client is
            configured, otherwise ``{"status": "success", "voices": [...]}``
            with one ``voice_id`` / ``name`` / ``category`` dict per voice.
        """
        if not self.client:
            return {"status": "unavailable", "voices": []}

        listing = await self.client.voices.get_all()

        return {
            "status": "success",
            "voices": [
                {
                    "voice_id": entry.voice_id,
                    "name": entry.name,
                    "category": entry.category
                }
                for entry in listing.voices
            ]
        }

    def set_voice(self, voice_name: str) -> bool:
        """Set the current voice persona.

        Returns:
            True if ``voice_name`` is a known persona and was selected,
            False otherwise (the current persona is left unchanged).
        """
        if voice_name in self.voices:
            self.current_voice = voice_name
            return True
        return False


# Global voice interface: a shared module-level instance, constructed at
# import time (its __init__ reads ELEVENLABS_API_KEY from the environment).
voice = VoiceInterface()