""" Voice Handler Module Provides Speech-to-Text (STT) and Text-to-Speech (TTS) capabilities with multiple provider options for different cost/quality tiers. """ import os import tempfile from abc import ABC, abstractmethod from pathlib import Path from typing import Optional, List, Dict import asyncio # Import voice processing libraries from openai import OpenAI import whisper import edge_tts from gtts import gTTS # ============================================================================ # Configuration and Cost Tiers # ============================================================================ class VoiceConfig: """Configuration for voice providers and their characteristics.""" # Language definitions with their codes and display names LANGUAGES = { "English": "en", "Spanish": "es", "French": "fr", "German": "de", "Italian": "it", "Portuguese": "pt", "Dutch": "nl", "Russian": "ru", "Chinese (Mandarin)": "zh", "Japanese": "ja", "Korean": "ko", "Arabic": "ar", "Hindi": "hi", "Turkish": "tr", "Polish": "pl", "Swedish": "sv", "Danish": "da", "Norwegian": "no", "Finnish": "fi", "Greek": "el", "Czech": "cs", "Romanian": "ro", "Hungarian": "hu", "Thai": "th", "Vietnamese": "vi", "Indonesian": "id", "Malay": "ms", "Filipino": "fil", "Hebrew": "he", "Ukrainian": "uk", } # Multilingual Edge TTS voices organized by language EDGE_TTS_VOICES = { "en": ["en-US-JennyNeural", "en-US-GuyNeural", "en-US-AriaNeural", "en-GB-SoniaNeural", "en-GB-RyanNeural", "en-AU-NatashaNeural"], "es": ["es-ES-ElviraNeural", "es-ES-AlvaroNeural", "es-MX-DaliaNeural", "es-MX-JorgeNeural", "es-AR-ElenaNeural"], "fr": ["fr-FR-DeniseNeural", "fr-FR-HenriNeural", "fr-CA-SylvieNeural", "fr-CA-AntoineNeural", "fr-BE-CharlineNeural"], "de": ["de-DE-KatjaNeural", "de-DE-ConradNeural", "de-AT-IngridNeural", "de-CH-LeniNeural"], "it": ["it-IT-ElsaNeural", "it-IT-DiegoNeural", "it-IT-IsabellaNeural"], "pt": ["pt-BR-FranciscaNeural", "pt-BR-AntonioNeural", "pt-PT-RaquelNeural", "pt-PT-DuarteNeural"], "nl": ["nl-NL-ColetteNeural", "nl-NL-MaartenNeural", "nl-BE-DenaNeural"], "ru": ["ru-RU-SvetlanaNeural", "ru-RU-DmitryNeural"], "zh": ["zh-CN-XiaoxiaoNeural", "zh-CN-YunxiNeural", "zh-TW-HsiaoChenNeural", "zh-HK-HiuMaanNeural"], "ja": ["ja-JP-NanamiNeural", "ja-JP-KeitaNeural"], "ko": ["ko-KR-SunHiNeural", "ko-KR-InJoonNeural"], "ar": ["ar-SA-ZariyahNeural", "ar-SA-HamedNeural", "ar-EG-SalmaNeural"], "hi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"], "tr": ["tr-TR-EmelNeural", "tr-TR-AhmetNeural"], "pl": ["pl-PL-ZofiaNeural", "pl-PL-MarekNeural"], "sv": ["sv-SE-SofieNeural", "sv-SE-MattiasNeural"], "da": ["da-DK-ChristelNeural", "da-DK-JeppeNeural"], "no": ["nb-NO-PernilleNeural", "nb-NO-FinnNeural"], "fi": ["fi-FI-NooraNeural", "fi-FI-HarriNeural"], "el": ["el-GR-AthinaNeural", "el-GR-NestorasNeural"], "cs": ["cs-CZ-VlastaNeural", "cs-CZ-AntoninNeural"], "ro": ["ro-RO-AlinaNeural", "ro-RO-EmilNeural"], "hu": ["hu-HU-NoemiNeural", "hu-HU-TamasNeural"], "th": ["th-TH-PremwadeeNeural", "th-TH-NiwatNeural"], "vi": ["vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural"], "id": ["id-ID-GadisNeural", "id-ID-ArdiNeural"], "ms": ["ms-MY-YasminNeural", "ms-MY-OsmanNeural"], "fil": ["fil-PH-BlessicaNeural", "fil-PH-AngeloNeural"], "he": ["he-IL-HilaNeural", "he-IL-AvriNeural"], "uk": ["uk-UA-PolinaNeural", "uk-UA-OstapNeural"], } # STT Provider definitions STT_PROVIDERS = { "OpenAI Whisper API": { "id": "openai_whisper", "cost_tier": "medium", "cost_per_minute": 0.006, "requires_api_key": True, }, "Local Whisper (Tiny)": { "id": "local_whisper_tiny", 
"cost_tier": "free", "cost_per_minute": 0.0, "requires_api_key": False, }, "Local Whisper (Base)": { "id": "local_whisper_base", "cost_tier": "free", "cost_per_minute": 0.0, "requires_api_key": False, }, } # TTS Provider definitions TTS_PROVIDERS = { "Edge-TTS (Free)": { "id": "edge_tts", "cost_tier": "free", "cost_per_1k_chars": 0.0, "requires_api_key": False, "voices": [] # Will be populated dynamically based on language }, "OpenAI TTS": { "id": "openai_tts", "cost_tier": "medium", "cost_per_1k_chars": 0.015, "requires_api_key": True, "voices": ["alloy", "echo", "fable", "onyx", "nova", "shimmer"] }, "gTTS (Free)": { "id": "gtts", "cost_tier": "free", "cost_per_1k_chars": 0.0, "requires_api_key": False, "voices": ["default"] }, } # Default selections DEFAULT_STT = "OpenAI Whisper API" DEFAULT_TTS = "gTTS (Free)" # More reliable on cloud platforms like HF Spaces DEFAULT_TTS_VOICE = "default" DEFAULT_LANGUAGE = "English" # ============================================================================ # Abstract Base Classes # ============================================================================ class STTProvider(ABC): """Abstract base class for Speech-to-Text providers.""" @abstractmethod def transcribe(self, audio_path: str) -> str: """ Transcribe audio file to text. Args: audio_path: Path to audio file Returns: Transcribed text """ pass class TTSProvider(ABC): """Abstract base class for Text-to-Speech providers.""" @abstractmethod def synthesize(self, text: str, output_path: Optional[str] = None) -> str: """ Synthesize text to speech. Args: text: Text to convert to speech output_path: Optional path to save audio file Returns: Path to generated audio file """ pass @abstractmethod def get_available_voices(self) -> List[str]: """Get list of available voices for this provider.""" pass # ============================================================================ # STT Provider Implementations # ============================================================================ class OpenAIWhisperSTT(STTProvider): """OpenAI Whisper API implementation.""" def __init__(self, api_key: Optional[str] = None): self.api_key = api_key or os.getenv("OPENAI_API_KEY") if not self.api_key: raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.") self.client = OpenAI(api_key=self.api_key) def transcribe(self, audio_path: str) -> str: """Transcribe audio using OpenAI Whisper API.""" try: with open(audio_path, "rb") as audio_file: transcript = self.client.audio.transcriptions.create( model="whisper-1", file=audio_file ) return transcript.text except Exception as e: raise Exception(f"OpenAI Whisper transcription failed: {str(e)}") class LocalWhisperSTT(STTProvider): """Local Whisper model implementation.""" def __init__(self, model_size: str = "base"): """ Initialize local Whisper model. 

        Args:
            model_size: Model size (tiny, base, small, medium, large)
        """
        self.model_size = model_size
        self.model = None

    def _load_model(self):
        """Lazy load the model."""
        if self.model is None:
            self.model = whisper.load_model(self.model_size)

    def transcribe(self, audio_path: str) -> str:
        """Transcribe audio using local Whisper model."""
        self._load_model()
        try:
            result = self.model.transcribe(audio_path)
            return result["text"]
        except Exception as e:
            raise Exception(f"Local Whisper transcription failed: {str(e)}")


# ============================================================================
# TTS Provider Implementations
# ============================================================================

class EdgeTTSProvider(TTSProvider):
    """Microsoft Edge TTS implementation (free)."""

    def __init__(self, voice: str = "en-US-JennyNeural"):
        self.voice = voice

    def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
        """Synthesize speech using Edge TTS."""
        if output_path is None:
            output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3")

        try:
            # Edge TTS requires async
            async def _synthesize():
                communicate = edge_tts.Communicate(text, self.voice)
                await communicate.save(output_path)

            asyncio.run(_synthesize())
            return output_path
        except Exception as e:
            raise Exception(f"Edge TTS synthesis failed: {str(e)}")

    def get_available_voices(self) -> List[str]:
        """Get available Edge TTS voices."""
        return VoiceConfig.TTS_PROVIDERS["Edge-TTS (Free)"]["voices"]


class OpenAITTSProvider(TTSProvider):
    """OpenAI TTS implementation."""

    def __init__(self, voice: str = "nova", api_key: Optional[str] = None):
        self.voice = voice
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.")
        self.client = OpenAI(api_key=self.api_key)

    def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
        """Synthesize speech using OpenAI TTS."""
        if output_path is None:
            output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3")

        try:
            response = self.client.audio.speech.create(
                model="tts-1",
                voice=self.voice,
                input=text
            )
            response.stream_to_file(output_path)
            return output_path
        except Exception as e:
            raise Exception(f"OpenAI TTS synthesis failed: {str(e)}")

    def get_available_voices(self) -> List[str]:
        """Get available OpenAI TTS voices."""
        return VoiceConfig.TTS_PROVIDERS["OpenAI TTS"]["voices"]


class GTTSProvider(TTSProvider):
    """Google TTS implementation (free, basic quality)."""

    def __init__(self, voice: str = "default", language: str = "en"):
        self.voice = voice
        self.language = language

    def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
        """Synthesize speech using gTTS."""
        if output_path is None:
            output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3")

        try:
            tts = gTTS(text=text, lang=self.language)
            tts.save(output_path)
            return output_path
        except Exception as e:
            raise Exception(f"gTTS synthesis failed: {str(e)}")

    def get_available_voices(self) -> List[str]:
        """Get available gTTS voices."""
        return VoiceConfig.TTS_PROVIDERS["gTTS (Free)"]["voices"]


# ============================================================================
# Factory Functions
# ============================================================================

def create_stt_provider(provider_name: str) -> STTProvider:
    """
    Create an STT provider instance.

    Args:
        provider_name: Name of the provider (from VoiceConfig.STT_PROVIDERS)

    Returns:
        STTProvider instance
    """
    provider_id = VoiceConfig.STT_PROVIDERS[provider_name]["id"]

    if provider_id == "openai_whisper":
        return OpenAIWhisperSTT()
    elif provider_id == "local_whisper_tiny":
        return LocalWhisperSTT(model_size="tiny")
    elif provider_id == "local_whisper_base":
        return LocalWhisperSTT(model_size="base")
    else:
        raise ValueError(f"Unknown STT provider: {provider_name}")


def create_tts_provider(provider_name: str, voice: Optional[str] = None, language: str = "en") -> TTSProvider:
    """
    Create a TTS provider instance.

    Args:
        provider_name: Name of the provider (from VoiceConfig.TTS_PROVIDERS)
        voice: Optional voice name
        language: Language code (ISO 639-1)

    Returns:
        TTSProvider instance
    """
    provider_id = VoiceConfig.TTS_PROVIDERS[provider_name]["id"]
    provider_info = VoiceConfig.TTS_PROVIDERS[provider_name]

    # Use default voice if not specified
    if voice is None:
        voice = provider_info["voices"][0] if provider_info["voices"] else None

    if provider_id == "edge_tts":
        # Edge-TTS voices are resolved per language, so if no voice was given
        # fall back to the first voice available for the requested language
        if voice is None:
            edge_voices = VoiceConfig.EDGE_TTS_VOICES.get(language, VoiceConfig.EDGE_TTS_VOICES["en"])
            voice = edge_voices[0]
        return EdgeTTSProvider(voice=voice)
    elif provider_id == "openai_tts":
        return OpenAITTSProvider(voice=voice)
    elif provider_id == "gtts":
        return GTTSProvider(voice=voice, language=language)
    else:
        raise ValueError(f"Unknown TTS provider: {provider_name}")


def get_available_stt_providers() -> List[str]:
    """Get list of available STT provider names."""
    return list(VoiceConfig.STT_PROVIDERS.keys())


def get_available_tts_providers() -> List[str]:
    """Get list of available TTS provider names."""
    return list(VoiceConfig.TTS_PROVIDERS.keys())


def get_voices_for_provider(provider_name: str, language: str = "en") -> List[str]:
    """
    Get available voices for a TTS provider, optionally filtered by language.

    Args:
        provider_name: Name of the provider
        language: Language code (ISO 639-1) for filtering voices

    Returns:
        List of available voices
    """
    if provider_name not in VoiceConfig.TTS_PROVIDERS:
        return []

    provider_id = VoiceConfig.TTS_PROVIDERS[provider_name]["id"]

    # For Edge TTS, return language-specific voices
    if provider_id == "edge_tts":
        return VoiceConfig.EDGE_TTS_VOICES.get(language, VoiceConfig.EDGE_TTS_VOICES.get("en", []))

    # For other providers, return all voices
    return VoiceConfig.TTS_PROVIDERS[provider_name]["voices"]


def get_provider_info(provider_name: str, provider_type: str = "tts") -> Dict:
    """
    Get information about a provider.

    Args:
        provider_name: Name of the provider
        provider_type: "stt" or "tts"

    Returns:
        Provider information dictionary
    """
    if provider_type == "tts":
        return VoiceConfig.TTS_PROVIDERS.get(provider_name, {})
    else:
        return VoiceConfig.STT_PROVIDERS.get(provider_name, {})


def get_available_languages() -> List[str]:
    """Get list of available language names."""
    return list(VoiceConfig.LANGUAGES.keys())


def get_language_code(language_name: str) -> str:
    """
    Get language code from language name.

    Args:
        language_name: Display name of the language (e.g., "English")

    Returns:
        Language code (e.g., "en")
    """
    return VoiceConfig.LANGUAGES.get(language_name, "en")


def get_default_voice_for_language(language_name: str, provider_name: str = "Edge-TTS (Free)") -> str:
    """
    Get the default voice for a specific language and provider.

    Args:
        language_name: Display name of the language
        provider_name: Name of the TTS provider

    Returns:
        Default voice ID for the language
    """
    language_code = get_language_code(language_name)
    voices = get_voices_for_provider(provider_name, language_code)

    if voices:
        return voices[0]

    # Fall back to the module-wide default voice if the provider lists no
    # voices for this language
    return VoiceConfig.DEFAULT_TTS_VOICE
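

# ============================================================================
# Usage Example
# ============================================================================
# Minimal usage sketch, not part of the public API: it assumes the free gTTS
# default provider and network access for gTTS. An OPENAI_API_KEY environment
# variable is only needed if you switch to the OpenAI-backed providers; the
# same factory pattern applies to STT via create_stt_provider().
if __name__ == "__main__":
    print("Available STT providers:", get_available_stt_providers())
    print("Available TTS providers:", get_available_tts_providers())

    # Resolve the language code and a sensible default voice for it
    language = VoiceConfig.DEFAULT_LANGUAGE
    code = get_language_code(language)
    print(f"Default voice for {language}:", get_default_voice_for_language(language))

    # Synthesize a short sample with the free gTTS provider
    tts = create_tts_provider(VoiceConfig.DEFAULT_TTS, language=code)
    audio_file = tts.synthesize("Hello from the voice handler module.")
    print("Audio written to:", audio_file)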