joelazo committed
Commit 4b9febd · 1 Parent(s): 26e714f

Initial commit

Files changed (5)
  1. language_tutor.py +385 -0
  2. pyproject.toml +15 -0
  3. requirements.txt +7 -0
  4. uv.lock +0 -0
  5. voice_handler.py +480 -0
language_tutor.py ADDED
@@ -0,0 +1,385 @@
+ import gradio as gr
+ from huggingface_hub import InferenceClient
+ from dotenv import load_dotenv
+ from voice_handler import (
+     create_stt_provider,
+     create_tts_provider,
+     get_available_stt_providers,
+     get_available_tts_providers,
+     get_voices_for_provider,
+     get_available_languages,
+     get_language_code,
+     get_default_voice_for_language,
+     VoiceConfig
+ )
+
+ load_dotenv(override=True)
+
+ # Initialize the Hugging Face Inference Client
+ model_name = "swiss-ai/Apertus-70B-Instruct-2509"
+ short_model_name = "Apertus-70B-Instruct"
+ client = InferenceClient(model=model_name)
+
+
+ def format_messages(message, chat_history, system_prompt):
+     """Format the conversation into a messages list."""
+     messages = []
+
+     # Add system prompt if provided
+     if system_prompt.strip():
+         messages.append({"role": "system", "content": system_prompt})
+
+     # Add chat history (already in messages format)
+     messages.extend(chat_history)
+
+     # Add current message
+     messages.append({"role": "user", "content": message})
+
+     return messages
+
+
+ def create_language_tutor_prompt(native_language, target_language):
+     """
+     Create a system prompt for the language tutor based on native and target languages.
+
+     Args:
+         native_language: User's native language
+         target_language: Language the user wants to learn
+
+     Returns:
+         System prompt string
+     """
+     prompt = f"""You are an expert language tutor helping a {native_language} speaker learn {target_language}.
+
+ Your role:
+ - Respond primarily in {target_language} to provide immersive practice
+ - Provide {native_language} translations when the user seems confused or asks for help
+ - Correct mistakes gently and explain grammar rules when appropriate
+ - Adjust your vocabulary and sentence complexity based on the user's level
+ - Ask engaging questions to encourage conversation practice
+ - Provide cultural context when relevant
+ - Be patient, encouraging, and supportive
+
+ Guidelines:
+ - Keep responses conversational and natural
+ - Use {target_language} for the main response
+ - Include {native_language} explanations in parentheses when helpful
+ - Praise progress and provide constructive feedback
+ - Adapt difficulty based on the user's responses
+
+ Start by greeting the user and asking what they'd like to practice today."""
+
+     return prompt
+
+
+ def transcribe_audio(audio_path, stt_provider_name):
+     """
+     Transcribe audio to text using the selected STT provider.
+
+     Args:
+         audio_path: Path to audio file
+         stt_provider_name: Name of STT provider
+
+     Returns:
+         Transcribed text or error message
+     """
+     if audio_path is None:
+         return ""
+
+     try:
+         stt_provider = create_stt_provider(stt_provider_name)
+         text = stt_provider.transcribe(audio_path)
+         return text
+     except Exception as e:
+         return f"[Transcription Error: {str(e)}]"
+
+
+ def synthesize_speech(text, tts_provider_name, tts_voice, target_language="English"):
+     """
+     Synthesize text to speech using the selected TTS provider.
+
+     Args:
+         text: Text to synthesize
+         tts_provider_name: Name of TTS provider
+         tts_voice: Voice to use
+         target_language: Target language name for TTS
+
+     Returns:
+         Path to generated audio file, or None if synthesis failed
+     """
+     if not text or not text.strip():
+         return None
+
+     try:
+         language_code = get_language_code(target_language)
+         tts_provider = create_tts_provider(tts_provider_name, voice=tts_voice, language=language_code)
+         audio_path = tts_provider.synthesize(text)
+         return audio_path
+     except Exception as e:
+         print(f"TTS Error: {str(e)}")
+         return None
+
+
+ def update_voice_dropdown(tts_provider_name, target_language="English"):
+     """
+     Update the voice dropdown based on the selected TTS provider and target language.
+
+     Args:
+         tts_provider_name: Name of TTS provider
+         target_language: Target language for voice selection
+
+     Returns:
+         Updated dropdown configuration
+     """
+     language_code = get_language_code(target_language)
+     voices = get_voices_for_provider(tts_provider_name, language_code)
+     return gr.Dropdown(choices=voices, value=voices[0] if voices else None)
+
+
+ def chat(message, chat_history, system_prompt, max_tokens, temperature, top_p,
+          enable_tts, tts_provider_name, tts_voice, target_language):
+     """Generate a response from the Hugging Face hosted model."""
+     if not message.strip():
+         return "", chat_history, None
+
+     # Format the messages
+     messages = format_messages(message, chat_history, system_prompt)
+
+     try:
+         # Call the Hugging Face Inference API
+         response = client.chat_completion(
+             messages=messages,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             stream=False
+         )
+
+         # Extract the assistant's reply
+         assistant_message = response.choices[0].message.content
+
+         # Update chat history with messages format
+         chat_history.append({"role": "user", "content": message})
+         chat_history.append({"role": "assistant", "content": assistant_message})
+
+         # Generate TTS audio if enabled
+         audio_output = None
+         if enable_tts:
+             audio_output = synthesize_speech(assistant_message, tts_provider_name, tts_voice, target_language)
+
+         return "", chat_history, audio_output
+
+     except Exception as e:
+         error_message = f"Error: {str(e)}"
+         chat_history.append({"role": "user", "content": message})
+         chat_history.append({"role": "assistant", "content": error_message})
+         return "", chat_history, None
+
+
+ def process_voice_input(audio, stt_provider_name):
+     """
+     Process voice input and return the transcribed text.
+
+     Args:
+         audio: Audio file from microphone
+         stt_provider_name: Name of STT provider
+
+     Returns:
+         Transcribed text
+     """
+     if audio is None:
+         return ""
+
+     transcribed_text = transcribe_audio(audio, stt_provider_name)
+     return transcribed_text
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="Language Tutor with Apertus-70B", theme=gr.themes.Glass(primary_hue="indigo")) as demo:
+     gr.Markdown("# 🌍 Language Tutor powered by Apertus-70B")
+     gr.Markdown(f"Practice any language with an AI tutor powered by **{model_name}** - trained on 1000+ languages!")
+     gr.Markdown("⚠️ **Note**: You may need a Hugging Face token for API access. Set it with `huggingface-cli login` or pass it to InferenceClient.")
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             chatbot = gr.Chatbot(label="Conversation", height=400, type='messages')
+
+             # Text input section
+             with gr.Row():
+                 msg = gr.Textbox(
+                     label="Your Message",
+                     placeholder="Type your message here...",
+                     scale=4,
+                     lines=2
+                 )
+                 submit = gr.Button("Send", scale=1, variant="primary")
+
+             # Voice input section
+             with gr.Row():
+                 with gr.Column(scale=4):
+                     voice_input = gr.Audio(
+                         sources=["microphone"],
+                         type="filepath",
+                         label="Voice Input (Click to Record)"
+                     )
+                 with gr.Column(scale=1):
+                     transcribe_btn = gr.Button("Transcribe", variant="secondary")
+
+             # Voice output section
+             voice_output = gr.Audio(
+                 label="Assistant Voice Response",
+                 autoplay=True,
+                 visible=True
+             )
+
+             clear = gr.Button("Clear Conversation")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### 🌐 Language Settings")
+
+             native_language = gr.Dropdown(
+                 choices=get_available_languages(),
+                 value="English",
+                 label="Your Native Language",
+                 info="Language for explanations and help"
+             )
+
+             target_language = gr.Dropdown(
+                 choices=get_available_languages(),
+                 value="Spanish",
+                 label="Language to Practice",
+                 info="Language you want to learn"
+             )
+
+             system_prompt = gr.Textbox(
+                 label="System Prompt (Auto-generated)",
+                 placeholder="System prompt is automatically generated based on language selection...",
+                 lines=5,
+                 value=create_language_tutor_prompt("English", "Spanish"),
+                 interactive=True,
+                 info="You can customize this if needed"
+             )
+
+             gr.Markdown("### Generation Parameters")
+
+             max_tokens = gr.Slider(
+                 minimum=50,
+                 maximum=2048,
+                 value=512,
+                 step=50,
+                 label="Max Tokens",
+                 info="Maximum length of the response"
+             )
+
+             temperature = gr.Slider(
+                 minimum=0.0,
+                 maximum=2.0,
+                 value=0.7,
+                 step=0.1,
+                 label="Temperature",
+                 info="Higher = more creative, lower = more focused"
+             )
+
+             top_p = gr.Slider(
+                 minimum=0.0,
+                 maximum=1.0,
+                 value=0.9,
+                 step=0.05,
+                 label="Top P",
+                 info="Nucleus sampling threshold"
+             )
+
+             gr.Markdown("### Voice Settings")
+
+             enable_voice_input = gr.Checkbox(
+                 label="Enable Voice Input (STT)",
+                 value=True,
+                 info="Transcribe voice to text"
+             )
+
+             stt_provider = gr.Dropdown(
+                 choices=get_available_stt_providers(),
+                 value=VoiceConfig.DEFAULT_STT,
+                 label="Speech-to-Text Provider",
+                 info="Choose quality/cost tier"
+             )
+
+             enable_voice_output = gr.Checkbox(
+                 label="Enable Voice Output (TTS)",
+                 value=False,
+                 info="Convert responses to speech"
+             )
+
+             tts_provider = gr.Dropdown(
+                 choices=get_available_tts_providers(),
+                 value=VoiceConfig.DEFAULT_TTS,
+                 label="Text-to-Speech Provider",
+                 info="Choose quality/cost tier"
+             )
+
+             tts_voice = gr.Dropdown(
+                 choices=get_voices_for_provider(VoiceConfig.DEFAULT_TTS, get_language_code("Spanish")),
+                 value=get_default_voice_for_language("Spanish", VoiceConfig.DEFAULT_TTS),
+                 label="TTS Voice",
+                 info="Voice automatically matched to target language"
+             )
+
+     # Event handlers
+
+     # Update system prompt when languages change
+     def update_system_prompt(native_lang, target_lang):
+         return create_language_tutor_prompt(native_lang, target_lang)
+
+     native_language.change(
+         update_system_prompt,
+         inputs=[native_language, target_language],
+         outputs=[system_prompt]
+     )
+
+     target_language.change(
+         update_system_prompt,
+         inputs=[native_language, target_language],
+         outputs=[system_prompt]
+     )
+
+     # Update TTS voice dropdown when target language or provider changes
+     target_language.change(
+         update_voice_dropdown,
+         inputs=[tts_provider, target_language],
+         outputs=[tts_voice]
+     )
+
+     tts_provider.change(
+         update_voice_dropdown,
+         inputs=[tts_provider, target_language],
+         outputs=[tts_voice]
+     )
+
+     # Text message submit
+     submit.click(
+         chat,
+         inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
+                 enable_voice_output, tts_provider, tts_voice, target_language],
+         outputs=[msg, chatbot, voice_output]
+     )
+
+     msg.submit(
+         chat,
+         inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
+                 enable_voice_output, tts_provider, tts_voice, target_language],
+         outputs=[msg, chatbot, voice_output]
+     )
+
+     # Voice input transcription
+     transcribe_btn.click(
+         process_voice_input,
+         inputs=[voice_input, stt_provider],
+         outputs=[msg]
+     )
+
+     # Clear conversation
+     clear.click(
+         lambda: ([], None),
+         outputs=[chatbot, voice_output]
+     )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch(share=False)
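
For a quick sanity check of the request path without launching the UI, here is a minimal headless sketch (an assumption, not part of the commit): it presumes the dependencies from requirements.txt are installed, a Hugging Face token is configured, and you run it from the repo root so `language_tutor` imports; the user message is illustrative.

# smoke_test.py - hypothetical headless exercise of the chat pipeline
from language_tutor import client, create_language_tutor_prompt, format_messages

# Build the tutor system prompt and a one-turn conversation
system_prompt = create_language_tutor_prompt("English", "Spanish")
messages = format_messages("Hola, ¿cómo estás?", [], system_prompt)

# The same call chat() makes, minus the Gradio wiring and the TTS step
response = client.chat_completion(
    messages=messages,
    max_tokens=256,
    temperature=0.7,
    top_p=0.9,
    stream=False
)
print(response.choices[0].message.content)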
pyproject.toml ADDED
@@ -0,0 +1,15 @@
+ [project]
+ name = "apertus"
+ version = "0.1.0"
+ description = "Add your description here"
+ requires-python = ">=3.13"
+ dependencies = [
+     "dotenv>=0.9.9",
+     "edge-tts>=7.2.3",
+     "gradio>=5.49.1",
+     "gtts>=2.5.4",
+     "huggingface-hub>=1.1.4",
+     "openai>=2.8.0",
+     "openai-whisper>=20250625",
+     "python-dotenv>=1.2.1",
+ ]
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ huggingface_hub
+ python-dotenv
+ openai
+ edge-tts
+ openai-whisper
+ gtts
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
voice_handler.py ADDED
@@ -0,0 +1,480 @@
+ """
+ Voice Handler Module
+ Provides Speech-to-Text (STT) and Text-to-Speech (TTS) capabilities
+ with multiple provider options for different cost/quality tiers.
+ """
+
+ import os
+ import tempfile
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Optional, List, Dict
+ import asyncio
+
+ # Import voice processing libraries
+ from openai import OpenAI
+ import whisper
+ import edge_tts
+ from gtts import gTTS
+
+
+ # ============================================================================
+ # Configuration and Cost Tiers
+ # ============================================================================
+
+ class VoiceConfig:
+     """Configuration for voice providers and their characteristics."""
+
+     # Language definitions with their codes and display names
+     LANGUAGES = {
+         "English": "en",
+         "Spanish": "es",
+         "French": "fr",
+         "German": "de",
+         "Italian": "it",
+         "Portuguese": "pt",
+         "Dutch": "nl",
+         "Russian": "ru",
+         "Chinese (Mandarin)": "zh",
+         "Japanese": "ja",
+         "Korean": "ko",
+         "Arabic": "ar",
+         "Hindi": "hi",
+         "Turkish": "tr",
+         "Polish": "pl",
+         "Swedish": "sv",
+         "Danish": "da",
+         "Norwegian": "no",
+         "Finnish": "fi",
+         "Greek": "el",
+         "Czech": "cs",
+         "Romanian": "ro",
+         "Hungarian": "hu",
+         "Thai": "th",
+         "Vietnamese": "vi",
+         "Indonesian": "id",
+         "Malay": "ms",
+         "Filipino": "fil",
+         "Hebrew": "he",
+         "Ukrainian": "uk",
+     }
+
+     # Multilingual Edge TTS voices organized by language
+     EDGE_TTS_VOICES = {
+         "en": ["en-US-JennyNeural", "en-US-GuyNeural", "en-US-AriaNeural", "en-GB-SoniaNeural", "en-GB-RyanNeural", "en-AU-NatashaNeural"],
+         "es": ["es-ES-ElviraNeural", "es-ES-AlvaroNeural", "es-MX-DaliaNeural", "es-MX-JorgeNeural", "es-AR-ElenaNeural"],
+         "fr": ["fr-FR-DeniseNeural", "fr-FR-HenriNeural", "fr-CA-SylvieNeural", "fr-CA-AntoineNeural", "fr-BE-CharlineNeural"],
+         "de": ["de-DE-KatjaNeural", "de-DE-ConradNeural", "de-AT-IngridNeural", "de-CH-LeniNeural"],
+         "it": ["it-IT-ElsaNeural", "it-IT-DiegoNeural", "it-IT-IsabellaNeural"],
+         "pt": ["pt-BR-FranciscaNeural", "pt-BR-AntonioNeural", "pt-PT-RaquelNeural", "pt-PT-DuarteNeural"],
+         "nl": ["nl-NL-ColetteNeural", "nl-NL-MaartenNeural", "nl-BE-DenaNeural"],
+         "ru": ["ru-RU-SvetlanaNeural", "ru-RU-DmitryNeural"],
+         "zh": ["zh-CN-XiaoxiaoNeural", "zh-CN-YunxiNeural", "zh-TW-HsiaoChenNeural", "zh-HK-HiuMaanNeural"],
+         "ja": ["ja-JP-NanamiNeural", "ja-JP-KeitaNeural"],
+         "ko": ["ko-KR-SunHiNeural", "ko-KR-InJoonNeural"],
+         "ar": ["ar-SA-ZariyahNeural", "ar-SA-HamedNeural", "ar-EG-SalmaNeural"],
+         "hi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"],
+         "tr": ["tr-TR-EmelNeural", "tr-TR-AhmetNeural"],
+         "pl": ["pl-PL-ZofiaNeural", "pl-PL-MarekNeural"],
+         "sv": ["sv-SE-SofieNeural", "sv-SE-MattiasNeural"],
+         "da": ["da-DK-ChristelNeural", "da-DK-JeppeNeural"],
+         "no": ["nb-NO-PernilleNeural", "nb-NO-FinnNeural"],
+         "fi": ["fi-FI-NooraNeural", "fi-FI-HarriNeural"],
+         "el": ["el-GR-AthinaNeural", "el-GR-NestorasNeural"],
+         "cs": ["cs-CZ-VlastaNeural", "cs-CZ-AntoninNeural"],
+         "ro": ["ro-RO-AlinaNeural", "ro-RO-EmilNeural"],
+         "hu": ["hu-HU-NoemiNeural", "hu-HU-TamasNeural"],
+         "th": ["th-TH-PremwadeeNeural", "th-TH-NiwatNeural"],
+         "vi": ["vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural"],
+         "id": ["id-ID-GadisNeural", "id-ID-ArdiNeural"],
+         "ms": ["ms-MY-YasminNeural", "ms-MY-OsmanNeural"],
+         "fil": ["fil-PH-BlessicaNeural", "fil-PH-AngeloNeural"],
+         "he": ["he-IL-HilaNeural", "he-IL-AvriNeural"],
+         "uk": ["uk-UA-PolinaNeural", "uk-UA-OstapNeural"],
+     }
+
+     # STT Provider definitions
+     STT_PROVIDERS = {
+         "OpenAI Whisper API": {
+             "id": "openai_whisper",
+             "cost_tier": "medium",
+             "cost_per_minute": 0.006,
+             "requires_api_key": True,
+         },
+         "Local Whisper (Tiny)": {
+             "id": "local_whisper_tiny",
+             "cost_tier": "free",
+             "cost_per_minute": 0.0,
+             "requires_api_key": False,
+         },
+         "Local Whisper (Base)": {
+             "id": "local_whisper_base",
+             "cost_tier": "free",
+             "cost_per_minute": 0.0,
+             "requires_api_key": False,
+         },
+     }
+
+     # TTS Provider definitions
+     TTS_PROVIDERS = {
+         "Edge-TTS (Free)": {
+             "id": "edge_tts",
+             "cost_tier": "free",
+             "cost_per_1k_chars": 0.0,
+             "requires_api_key": False,
+             "voices": []  # Will be populated dynamically based on language
+         },
+         "OpenAI TTS": {
+             "id": "openai_tts",
+             "cost_tier": "medium",
+             "cost_per_1k_chars": 0.015,
+             "requires_api_key": True,
+             "voices": ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
+         },
+         "gTTS (Free)": {
+             "id": "gtts",
+             "cost_tier": "free",
+             "cost_per_1k_chars": 0.0,
+             "requires_api_key": False,
+             "voices": ["default"]
+         },
+     }
+
+     # Default selections
+     DEFAULT_STT = "OpenAI Whisper API"
+     DEFAULT_TTS = "Edge-TTS (Free)"
+     DEFAULT_TTS_VOICE = "en-US-JennyNeural"
+     DEFAULT_LANGUAGE = "English"
+
+
+ # ============================================================================
+ # Abstract Base Classes
+ # ============================================================================
+
+ class STTProvider(ABC):
+     """Abstract base class for Speech-to-Text providers."""
+
+     @abstractmethod
+     def transcribe(self, audio_path: str) -> str:
+         """
+         Transcribe audio file to text.
+
+         Args:
+             audio_path: Path to audio file
+
+         Returns:
+             Transcribed text
+         """
+         pass
+
+
+ class TTSProvider(ABC):
+     """Abstract base class for Text-to-Speech providers."""
+
+     @abstractmethod
+     def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
+         """
+         Synthesize text to speech.
+
+         Args:
+             text: Text to convert to speech
+             output_path: Optional path to save audio file
+
+         Returns:
+             Path to generated audio file
+         """
+         pass
+
+     @abstractmethod
+     def get_available_voices(self) -> List[str]:
+         """Get list of available voices for this provider."""
+         pass
+
+
+ # ============================================================================
+ # STT Provider Implementations
+ # ============================================================================
+
+ class OpenAIWhisperSTT(STTProvider):
+     """OpenAI Whisper API implementation."""
+
+     def __init__(self, api_key: Optional[str] = None):
+         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.")
+
+         self.client = OpenAI(api_key=self.api_key)
+
+     def transcribe(self, audio_path: str) -> str:
+         """Transcribe audio using OpenAI Whisper API."""
+         try:
+             with open(audio_path, "rb") as audio_file:
+                 transcript = self.client.audio.transcriptions.create(
+                     model="whisper-1",
+                     file=audio_file
+                 )
+             return transcript.text
+         except Exception as e:
+             raise Exception(f"OpenAI Whisper transcription failed: {str(e)}")
+
+
+ class LocalWhisperSTT(STTProvider):
+     """Local Whisper model implementation."""
+
+     def __init__(self, model_size: str = "base"):
+         """
+         Initialize local Whisper model.
+
+         Args:
+             model_size: Model size (tiny, base, small, medium, large)
+         """
+         self.model_size = model_size
+         self.model = None
+
+     def _load_model(self):
+         """Lazy load the model."""
+         if self.model is None:
+             self.model = whisper.load_model(self.model_size)
+
+     def transcribe(self, audio_path: str) -> str:
+         """Transcribe audio using local Whisper model."""
+         self._load_model()
+         try:
+             result = self.model.transcribe(audio_path)
+             return result["text"]
+         except Exception as e:
+             raise Exception(f"Local Whisper transcription failed: {str(e)}")
+
+
+ # ============================================================================
+ # TTS Provider Implementations
+ # ============================================================================
+
+ class EdgeTTSProvider(TTSProvider):
+     """Microsoft Edge TTS implementation (free)."""
+
+     def __init__(self, voice: str = "en-US-JennyNeural"):
+         self.voice = voice
+
+     def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
+         """Synthesize speech using Edge TTS."""
+         if output_path is None:
+             output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3")
+
+         try:
+             # Edge TTS requires async
+             async def _synthesize():
+                 communicate = edge_tts.Communicate(text, self.voice)
+                 await communicate.save(output_path)
+
+             asyncio.run(_synthesize())
+             return output_path
+         except Exception as e:
+             raise Exception(f"Edge TTS synthesis failed: {str(e)}")
+
+     def get_available_voices(self) -> List[str]:
+         """Get available Edge TTS voices."""
+         return VoiceConfig.TTS_PROVIDERS["Edge-TTS (Free)"]["voices"]
+
+
+ class OpenAITTSProvider(TTSProvider):
+     """OpenAI TTS implementation."""
+
+     def __init__(self, voice: str = "nova", api_key: Optional[str] = None):
+         self.voice = voice
+         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.")
+
+         self.client = OpenAI(api_key=self.api_key)
+
+     def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
+         """Synthesize speech using OpenAI TTS."""
+         if output_path is None:
+             output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3")
+
+         try:
+             response = self.client.audio.speech.create(
+                 model="tts-1",
+                 voice=self.voice,
+                 input=text
+             )
+             response.stream_to_file(output_path)
+             return output_path
+         except Exception as e:
+             raise Exception(f"OpenAI TTS synthesis failed: {str(e)}")
+
+     def get_available_voices(self) -> List[str]:
+         """Get available OpenAI TTS voices."""
+         return VoiceConfig.TTS_PROVIDERS["OpenAI TTS"]["voices"]
+
+
+ class GTTSProvider(TTSProvider):
+     """Google TTS implementation (free, basic quality)."""
+
+     def __init__(self, voice: str = "default", language: str = "en"):
+         self.voice = voice
+         self.language = language
+
+     def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
+         """Synthesize speech using gTTS."""
+         if output_path is None:
+             output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3")
+
+         try:
+             tts = gTTS(text=text, lang=self.language)
+             tts.save(output_path)
+             return output_path
+         except Exception as e:
+             raise Exception(f"gTTS synthesis failed: {str(e)}")
+
+     def get_available_voices(self) -> List[str]:
+         """Get available gTTS voices."""
+         return VoiceConfig.TTS_PROVIDERS["gTTS (Free)"]["voices"]
+
+
+ # ============================================================================
+ # Factory Functions
+ # ============================================================================
+
+ def create_stt_provider(provider_name: str) -> STTProvider:
+     """
+     Create an STT provider instance.
+
+     Args:
+         provider_name: Name of the provider (from VoiceConfig.STT_PROVIDERS)
+
+     Returns:
+         STTProvider instance
+     """
+     provider_id = VoiceConfig.STT_PROVIDERS[provider_name]["id"]
+
+     if provider_id == "openai_whisper":
+         return OpenAIWhisperSTT()
+     elif provider_id == "local_whisper_tiny":
+         return LocalWhisperSTT(model_size="tiny")
+     elif provider_id == "local_whisper_base":
+         return LocalWhisperSTT(model_size="base")
+     else:
+         raise ValueError(f"Unknown STT provider: {provider_name}")
+
+
+ def create_tts_provider(provider_name: str, voice: Optional[str] = None, language: str = "en") -> TTSProvider:
+     """
+     Create a TTS provider instance.
+
+     Args:
+         provider_name: Name of the provider (from VoiceConfig.TTS_PROVIDERS)
+         voice: Optional voice name
+         language: Language code (ISO 639-1)
+
+     Returns:
+         TTSProvider instance
+     """
+     provider_id = VoiceConfig.TTS_PROVIDERS[provider_name]["id"]
+     provider_info = VoiceConfig.TTS_PROVIDERS[provider_name]
+
+     # Use default voice if not specified
+     if voice is None:
+         voice = provider_info["voices"][0] if provider_info["voices"] else None
+
+     if provider_id == "edge_tts":
+         return EdgeTTSProvider(voice=voice)
+     elif provider_id == "openai_tts":
+         return OpenAITTSProvider(voice=voice)
+     elif provider_id == "gtts":
+         return GTTSProvider(voice=voice, language=language)
+     else:
+         raise ValueError(f"Unknown TTS provider: {provider_name}")
+
+
+ def get_available_stt_providers() -> List[str]:
+     """Get list of available STT provider names."""
+     return list(VoiceConfig.STT_PROVIDERS.keys())
+
+
+ def get_available_tts_providers() -> List[str]:
+     """Get list of available TTS provider names."""
+     return list(VoiceConfig.TTS_PROVIDERS.keys())
+
+
+ def get_voices_for_provider(provider_name: str, language: str = "en") -> List[str]:
+     """
+     Get available voices for a TTS provider, optionally filtered by language.
+
+     Args:
+         provider_name: Name of the provider
+         language: Language code (ISO 639-1) for filtering voices
+
+     Returns:
+         List of available voices
+     """
+     if provider_name not in VoiceConfig.TTS_PROVIDERS:
+         return []
+
+     provider_id = VoiceConfig.TTS_PROVIDERS[provider_name]["id"]
+
+     # For Edge TTS, return language-specific voices
+     if provider_id == "edge_tts":
+         return VoiceConfig.EDGE_TTS_VOICES.get(language, VoiceConfig.EDGE_TTS_VOICES.get("en", []))
+
+     # For other providers, return all voices
+     return VoiceConfig.TTS_PROVIDERS[provider_name]["voices"]
+
+
+ def get_provider_info(provider_name: str, provider_type: str = "tts") -> Dict:
+     """
+     Get information about a provider.
+
+     Args:
+         provider_name: Name of the provider
+         provider_type: "stt" or "tts"
+
+     Returns:
+         Provider information dictionary
+     """
+     if provider_type == "tts":
+         return VoiceConfig.TTS_PROVIDERS.get(provider_name, {})
+     else:
+         return VoiceConfig.STT_PROVIDERS.get(provider_name, {})
+
+
+ def get_available_languages() -> List[str]:
+     """Get list of available language names."""
+     return list(VoiceConfig.LANGUAGES.keys())
+
+
+ def get_language_code(language_name: str) -> str:
+     """
+     Get language code from language name.
+
+     Args:
+         language_name: Display name of the language (e.g., "English")
+
+     Returns:
+         Language code (e.g., "en")
+     """
+     return VoiceConfig.LANGUAGES.get(language_name, "en")
+
+
+ def get_default_voice_for_language(language_name: str, provider_name: str = "Edge-TTS (Free)") -> str:
+     """
+     Get the default voice for a specific language and provider.
+
+     Args:
+         language_name: Display name of the language
+         provider_name: Name of the TTS provider
+
+     Returns:
+         Default voice ID for the language
+     """
+     language_code = get_language_code(language_name)
+     voices = get_voices_for_provider(provider_name, language_code)
+
+     if voices:
+         return voices[0]
+
+     # Fallback to English if language not supported
+     return VoiceConfig.DEFAULT_TTS_VOICE
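
The factory functions above are the module's public surface; a usage sketch of the free tiers follows (an assumption for illustration, not part of the commit: no API keys are needed for these providers, and "recording.wav" is a placeholder path you would supply).

# hypothetical round trip: speech in, speech out, using only the free tiers
from voice_handler import (
    create_stt_provider,
    create_tts_provider,
    get_default_voice_for_language,
    get_language_code,
)

stt = create_stt_provider("Local Whisper (Tiny)")    # free, runs locally
text = stt.transcribe("recording.wav")               # placeholder input file

voice = get_default_voice_for_language("Spanish")    # resolves to "es-ES-ElviraNeural"
tts = create_tts_provider("Edge-TTS (Free)", voice=voice,
                          language=get_language_code("Spanish"))
print(tts.synthesize(f"Dijiste: {text}"))            # path to the generated .mp3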