File size: 5,099 Bytes
8d35b9c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
"""
ElevenLabs Voice Interface - For $2K + AirPods Pro Prize
Voice-first enterprise AI interaction.
"""
import os
from typing import Optional, AsyncGenerator
import asyncio
# Optional dependency: this module can still be imported when the ElevenLabs
# SDK is missing. ELEVENLABS_AVAILABLE records whether the import succeeded.
try:
    from elevenlabs import ElevenLabs, VoiceSettings
    from elevenlabs.client import AsyncElevenLabs
    ELEVENLABS_AVAILABLE = True
except ImportError:
    # Without the SDK, VoiceSettings and AsyncElevenLabs are never bound;
    # code that uses them must check ELEVENLABS_AVAILABLE first.
    ELEVENLABS_AVAILABLE = False
    print("[WARNING] ElevenLabs not installed")
class VoiceInterface:
    """
    Voice-first interface for OmniMind using ElevenLabs.

    Prize Integration: ElevenLabs Category Award ($2K + AirPods Pro)
    - Natural conversational AI
    - Streaming voice responses
    - Enterprise-grade voice quality

    When the ElevenLabs SDK is not installed or ``ELEVENLABS_API_KEY`` is
    unset, ``client`` is ``None``: ``text_to_speech`` yields nothing and
    ``get_available_voices`` reports ``"unavailable"``. Voice persona
    selection (``set_voice``) works regardless of configuration.
    """

    def __init__(self):
        """Read the API key from the environment and create the async client."""
        self.api_key = os.getenv("ELEVENLABS_API_KEY")

        # Voice configurations for different personas.
        # Assigned before the availability check so that set_voice() and
        # current_voice are usable even on an unconfigured instance.
        self.voices = {
            "professional": "ErXwobaYiN019PkySvjV",  # Antoni - professional male
            "friendly": "EXAVITQu4vr4xnSDxMaL",  # Sarah - friendly female
            "executive": "VR6AewLTigWG4xSOukaG",  # Arnold - authoritative male
        }
        self.current_voice = "professional"

        if not self.api_key or not ELEVENLABS_AVAILABLE:
            self.client = None
            print("[WARNING] ElevenLabs not configured")
            return

        self.client = AsyncElevenLabs(api_key=self.api_key)

    async def text_to_speech(
        self,
        text: str,
        voice: str = "professional",
        stream: bool = True
    ) -> AsyncGenerator[bytes, None]:
        """
        Convert text to speech with streaming support.

        Args:
            text: Text to convert
            voice: Voice persona (professional, friendly, executive); an
                unknown name falls back to "professional"
            stream: If True, yield audio chunks as they arrive for real-time
                playback; if False, yield the complete audio as one item

        Yields:
            Audio chunks (bytes). Nothing is yielded when the client is not
            configured.
        """
        if not self.client:
            # Not configured: behave as an empty async generator.
            return

        import inspect

        voice_id = self.voices.get(voice, self.voices["professional"])
        tts = self.client.text_to_speech
        request = tts.convert_as_stream if stream else tts.convert
        result = request(
            text=text,
            voice_id=voice_id,
            model_id="eleven_turbo_v2_5",  # turbo model, chosen for low latency
            voice_settings=VoiceSettings(
                stability=0.5,
                similarity_boost=0.75,
                style=0.5,
                use_speaker_boost=True,
            ),
        )

        # NOTE(review): the async client's TTS methods appear to return an
        # async iterator of chunks directly; awaiting an async iterator raises
        # TypeError. Await only if the SDK actually returned an awaitable.
        if inspect.isawaitable(result):
            result = await result

        if not hasattr(result, "__aiter__"):
            # The SDK returned a complete payload rather than a chunk stream.
            yield result
            return

        if stream:
            async for chunk in result:
                yield chunk
        else:
            # Non-streaming: assemble the chunks into one complete payload.
            yield b"".join([chunk async for chunk in result])

    async def speech_to_text(self, audio_data: bytes) -> str:
        """
        Convert speech to text using OpenAI Whisper.

        Transcription is delegated to OpenAI (model "whisper-1") rather than
        ElevenLabs. The audio is written to a temporary ``.wav`` file, which
        is always removed afterwards.

        Args:
            audio_data: Audio bytes (WAV format)

        Returns:
            Transcribed text
        """
        # Local imports: openai is only needed for this method. ``os`` must
        # NOT be re-imported here; a function-local ``import os`` makes the
        # name local and breaks the ``os.getenv`` call below.
        import tempfile

        from openai import AsyncOpenAI

        openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # Save audio temporarily; delete=False so the file can be reopened
        # by path after it has been closed.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        audio_path = tmp.name
        try:
            with tmp:
                tmp.write(audio_data)
            with open(audio_path, "rb") as audio_file:
                transcript = await openai_client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                )
            return transcript.text
        finally:
            # Cleanup runs even if the write or the transcription fails.
            os.unlink(audio_path)

    async def get_available_voices(self) -> dict:
        """
        Get list of available voices.

        Returns:
            ``{"status": "unavailable", "voices": []}`` when the client is
            not configured; otherwise ``{"status": "success", "voices": [...]}``
            with one dict (``voice_id``, ``name``, ``category``) per voice
            returned by the ElevenLabs API.
        """
        if not self.client:
            return {"status": "unavailable", "voices": []}

        voices = await self.client.voices.get_all()
        return {
            "status": "success",
            "voices": [
                {
                    "voice_id": voice.voice_id,
                    "name": voice.name,
                    "category": voice.category,
                }
                for voice in voices.voices
            ],
        }

    def set_voice(self, voice_name: str) -> bool:
        """
        Set the current voice persona.

        Args:
            voice_name: Key of a persona in ``self.voices``

        Returns:
            True if the persona exists and was selected, False otherwise
            (the current voice is left unchanged).
        """
        if voice_name in self.voices:
            self.current_voice = voice_name
            return True
        return False
# Global voice interface: a module-level instance created at import time.
# Construction reads ELEVENLABS_API_KEY from the environment, so the variable
# must be set before this module is imported for the client to be configured.
voice = VoiceInterface()
|