joelazo committed
Commit 4b9febd · 1 Parent(s): 26e714f

Initial commit

Files changed (5)
  1. language_tutor.py +385 -0
  2. pyproject.toml +15 -0
  3. requirements.txt +7 -0
  4. uv.lock +0 -0
  5. voice_handler.py +480 -0
language_tutor.py ADDED
@@ -0,0 +1,385 @@
+ import gradio as gr
+ from huggingface_hub import InferenceClient
+ from dotenv import load_dotenv
+ from voice_handler import (
+     create_stt_provider,
+     create_tts_provider,
+     get_available_stt_providers,
+     get_available_tts_providers,
+     get_voices_for_provider,
+     get_available_languages,
+     get_language_code,
+     get_default_voice_for_language,
+     VoiceConfig
+ )
+
+ load_dotenv(override=True)
+
+ # Initialize the Hugging Face Inference Client
+ model_name = "swiss-ai/Apertus-70B-Instruct-2509"
+ short_model_name = "Apertus-70B-Instruct"
+ client = InferenceClient(model=model_name)
+
+
+ def format_messages(message, chat_history, system_prompt):
+     """Format the conversation into a messages list."""
+     messages = []
+
+     # Add system prompt if provided
+     if system_prompt.strip():
+         messages.append({"role": "system", "content": system_prompt})
+
+     # Add chat history (already in messages format)
+     messages.extend(chat_history)
+
+     # Add current message
+     messages.append({"role": "user", "content": message})
+
+     return messages
+
+
+ def create_language_tutor_prompt(native_language, target_language):
+     """
+     Create a system prompt for the language tutor based on native and target languages.
+
+     Args:
+         native_language: User's native language
+         target_language: Language the user wants to learn
+
+     Returns:
+         System prompt string
+     """
+     prompt = f"""You are an expert language tutor helping a {native_language} speaker learn {target_language}.
+
+ Your role:
+ - Respond primarily in {target_language} to provide immersive practice
+ - Provide {native_language} translations when the user seems confused or asks for help
+ - Correct mistakes gently and explain grammar rules when appropriate
+ - Adjust your vocabulary and sentence complexity based on the user's level
+ - Ask engaging questions to encourage conversation practice
+ - Provide cultural context when relevant
+ - Be patient, encouraging, and supportive
+
+ Guidelines:
+ - Keep responses conversational and natural
+ - Use {target_language} for the main response
+ - Include {native_language} explanations in parentheses when helpful
+ - Praise progress and provide constructive feedback
+ - Adapt difficulty based on the user's responses
+
+ Start by greeting the user and asking what they'd like to practice today."""
+
+     return prompt
+
+
+ def transcribe_audio(audio_path, stt_provider_name):
+     """
+     Transcribe audio to text using the selected STT provider.
+
+     Args:
+         audio_path: Path to audio file
+         stt_provider_name: Name of STT provider
+
+     Returns:
+         Transcribed text or error message
+     """
+     if audio_path is None:
+         return ""
+
+     try:
+         stt_provider = create_stt_provider(stt_provider_name)
+         text = stt_provider.transcribe(audio_path)
+         return text
+     except Exception as e:
+         return f"[Transcription Error: {str(e)}]"
+
+
+ def synthesize_speech(text, tts_provider_name, tts_voice, target_language="English"):
+     """
+     Synthesize text to speech using the selected TTS provider.
+
+     Args:
+         text: Text to synthesize
+         tts_provider_name: Name of TTS provider
+         tts_voice: Voice to use
+         target_language: Target language name for TTS
+
+     Returns:
+         Path to generated audio file, or None if synthesis failed
+     """
+     if not text or not text.strip():
+         return None
+
+     try:
+         language_code = get_language_code(target_language)
+         tts_provider = create_tts_provider(tts_provider_name, voice=tts_voice, language=language_code)
+         audio_path = tts_provider.synthesize(text)
+         return audio_path
+     except Exception as e:
+         print(f"TTS Error: {str(e)}")
+         return None
+
+
+ def update_voice_dropdown(tts_provider_name, target_language="English"):
+     """
+     Update the voice dropdown based on the selected TTS provider and target language.
+
+     Args:
+         tts_provider_name: Name of TTS provider
+         target_language: Target language for voice selection
+
+     Returns:
+         Updated dropdown configuration
+     """
+     language_code = get_language_code(target_language)
+     voices = get_voices_for_provider(tts_provider_name, language_code)
+     return gr.Dropdown(choices=voices, value=voices[0] if voices else None)
+
+
+ def chat(message, chat_history, system_prompt, max_tokens, temperature, top_p,
+          enable_tts, tts_provider_name, tts_voice, target_language):
+     """Generate a response from the Hugging Face hosted model."""
+     if not message.strip():
+         return "", chat_history, None
+
+     # Format the messages
+     messages = format_messages(message, chat_history, system_prompt)
+
+     try:
+         # Call the Hugging Face Inference API
+         response = client.chat_completion(
+             messages=messages,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             stream=False
+         )
+
+         # Extract the assistant's reply
+         assistant_message = response.choices[0].message.content
+
+         # Update chat history with messages format
+         chat_history.append({"role": "user", "content": message})
+         chat_history.append({"role": "assistant", "content": assistant_message})
+
+         # Generate TTS audio if enabled
+         audio_output = None
+         if enable_tts:
+             audio_output = synthesize_speech(assistant_message, tts_provider_name, tts_voice, target_language)
+
+         return "", chat_history, audio_output
+
+     except Exception as e:
+         error_message = f"Error: {str(e)}"
+         chat_history.append({"role": "user", "content": message})
+         chat_history.append({"role": "assistant", "content": error_message})
+         return "", chat_history, None
+
+
+ def process_voice_input(audio, stt_provider_name):
+     """
+     Process voice input and return the transcribed text.
+
+     Args:
+         audio: Audio file from microphone
+         stt_provider_name: Name of STT provider
+
+     Returns:
+         Transcribed text
+     """
+     if audio is None:
+         return ""
+
+     transcribed_text = transcribe_audio(audio, stt_provider_name)
+     return transcribed_text
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="Language Tutor with Apertus-70B", theme=gr.themes.Glass(primary_hue="indigo")) as demo:
+     gr.Markdown("# 🌍 Language Tutor powered by Apertus-70B")
+     gr.Markdown(f"Practice any language with an AI tutor powered by **{model_name}** - trained on 1000+ languages!")
+     gr.Markdown("⚠️ **Note**: You may need a Hugging Face token for API access. Set it with `huggingface-cli login` or pass it to InferenceClient.")
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             chatbot = gr.Chatbot(label="Conversation", height=400, type='messages')
+
+             # Text input section
+             with gr.Row():
+                 msg = gr.Textbox(
+                     label="Your Message",
+                     placeholder="Type your message here...",
+                     scale=4,
+                     lines=2
+                 )
+                 submit = gr.Button("Send", scale=1, variant="primary")
+
+             # Voice input section
+             with gr.Row():
+                 with gr.Column(scale=4):
+                     voice_input = gr.Audio(
+                         sources=["microphone"],
+                         type="filepath",
+                         label="Voice Input (Click to Record)"
+                     )
+                 with gr.Column(scale=1):
+                     transcribe_btn = gr.Button("Transcribe", variant="secondary")
+
+             # Voice output section
+             voice_output = gr.Audio(
+                 label="Assistant Voice Response",
+                 autoplay=True,
+                 visible=True
+             )
+
+             clear = gr.Button("Clear Conversation")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### 🌐 Language Settings")
+
+             native_language = gr.Dropdown(
+                 choices=get_available_languages(),
+                 value="English",
+                 label="Your Native Language",
+                 info="Language for explanations and help"
+             )
+
+             target_language = gr.Dropdown(
+                 choices=get_available_languages(),
+                 value="Spanish",
+                 label="Language to Practice",
+                 info="Language you want to learn"
+             )
+
+             system_prompt = gr.Textbox(
+                 label="System Prompt (Auto-generated)",
+                 placeholder="System prompt is automatically generated based on language selection...",
+                 lines=5,
+                 value=create_language_tutor_prompt("English", "Spanish"),
+                 interactive=True,
+                 info="You can customize this if needed"
+             )
+
+             gr.Markdown("### Generation Parameters")
+
+             max_tokens = gr.Slider(
+                 minimum=50,
+                 maximum=2048,
+                 value=512,
+                 step=50,
+                 label="Max Tokens",
+                 info="Maximum length of the response"
+             )
+
+             temperature = gr.Slider(
+                 minimum=0.0,
+                 maximum=2.0,
+                 value=0.7,
+                 step=0.1,
+                 label="Temperature",
+                 info="Higher = more creative, lower = more focused"
+             )
+
+             top_p = gr.Slider(
+                 minimum=0.0,
+                 maximum=1.0,
+                 value=0.9,
+                 step=0.05,
+                 label="Top P",
+                 info="Nucleus sampling threshold"
+             )
+
+             gr.Markdown("### Voice Settings")
+
+             enable_voice_input = gr.Checkbox(
+                 label="Enable Voice Input (STT)",
+                 value=True,
+                 info="Transcribe voice to text"
+             )
+
+             stt_provider = gr.Dropdown(
+                 choices=get_available_stt_providers(),
+                 value=VoiceConfig.DEFAULT_STT,
+                 label="Speech-to-Text Provider",
+                 info="Choose quality/cost tier"
+             )
+
+             enable_voice_output = gr.Checkbox(
+                 label="Enable Voice Output (TTS)",
+                 value=False,
+                 info="Convert responses to speech"
+             )
+
+             tts_provider = gr.Dropdown(
+                 choices=get_available_tts_providers(),
+                 value=VoiceConfig.DEFAULT_TTS,
+                 label="Text-to-Speech Provider",
+                 info="Choose quality/cost tier"
+             )
+
+             tts_voice = gr.Dropdown(
+                 choices=get_voices_for_provider(VoiceConfig.DEFAULT_TTS, get_language_code("Spanish")),
+                 value=get_default_voice_for_language("Spanish", VoiceConfig.DEFAULT_TTS),
+                 label="TTS Voice",
+                 info="Voice automatically matched to target language"
+             )
+
+     # Event handlers
+
+     # Update system prompt when languages change
+     def update_system_prompt(native_lang, target_lang):
+         return create_language_tutor_prompt(native_lang, target_lang)
+
+     native_language.change(
+         update_system_prompt,
+         inputs=[native_language, target_language],
+         outputs=[system_prompt]
+     )
+
+     target_language.change(
+         update_system_prompt,
+         inputs=[native_language, target_language],
+         outputs=[system_prompt]
+     )
+
+     # Update TTS voice dropdown when target language or provider changes
+     target_language.change(
+         update_voice_dropdown,
+         inputs=[tts_provider, target_language],
+         outputs=[tts_voice]
+     )
+
+     tts_provider.change(
+         update_voice_dropdown,
+         inputs=[tts_provider, target_language],
+         outputs=[tts_voice]
+     )
+
+     # Text message submit
+     submit.click(
+         chat,
+         inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
+                 enable_voice_output, tts_provider, tts_voice, target_language],
+         outputs=[msg, chatbot, voice_output]
+     )
+
+     msg.submit(
+         chat,
+         inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
+                 enable_voice_output, tts_provider, tts_voice, target_language],
+         outputs=[msg, chatbot, voice_output]
+     )
+
+     # Voice input transcription
+     transcribe_btn.click(
+         process_voice_input,
+         inputs=[voice_input, stt_provider],
+         outputs=[msg]
+     )
+
+     # Clear conversation
+     clear.click(
+         lambda: ([], None),
+         outputs=[chatbot, voice_output]
+     )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch(share=False)
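
For a quick sanity check of the request path without launching the UI, here is a minimal headless sketch (an assumption, not part of the commit): it presumes the dependencies from requirements.txt are installed, a Hugging Face token is configured, and you run it from the repo root so `language_tutor` imports; the user message is illustrative.

# smoke_test.py - hypothetical headless exercise of the chat pipeline
from language_tutor import client, create_language_tutor_prompt, format_messages

# Build the tutor system prompt and a one-turn conversation
system_prompt = create_language_tutor_prompt("English", "Spanish")
messages = format_messages("Hola, ¿cómo estás?", [], system_prompt)

# The same call chat() makes, minus the Gradio wiring and the TTS step
response = client.chat_completion(
    messages=messages,
    max_tokens=256,
    temperature=0.7,
    top_p=0.9,
    stream=False
)
print(response.choices[0].message.content)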
pyproject.toml ADDED
@@ -0,0 +1,15 @@
+ [project]
+ name = "apertus"
+ version = "0.1.0"
+ description = "Add your description here"
+ requires-python = ">=3.13"
+ dependencies = [
+     "dotenv>=0.9.9",
+     "edge-tts>=7.2.3",
+     "gradio>=5.49.1",
+     "gtts>=2.5.4",
+     "huggingface-hub>=1.1.4",
+     "openai>=2.8.0",
+     "openai-whisper>=20250625",
+     "python-dotenv>=1.2.1",
+ ]
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ huggingface_hub
+ python-dotenv
+ openai
+ edge-tts
+ openai-whisper
+ gtts
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
voice_handler.py ADDED
@@ -0,0 +1,480 @@
+ """
+ Voice Handler Module
+ Provides Speech-to-Text (STT) and Text-to-Speech (TTS) capabilities
+ with multiple provider options for different cost/quality tiers.
+ """
+
+ import os
+ import tempfile
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Optional, List, Dict
+ import asyncio
+
+ # Import voice processing libraries
+ from openai import OpenAI
+ import whisper
+ import edge_tts
+ from gtts import gTTS
+
+
+ # ============================================================================
+ # Configuration and Cost Tiers
+ # ============================================================================
+
+ class VoiceConfig:
+     """Configuration for voice providers and their characteristics."""
+
+     # Language definitions with their codes and display names
+     LANGUAGES = {
+         "English": "en",
+         "Spanish": "es",
+         "French": "fr",
+         "German": "de",
+         "Italian": "it",
+         "Portuguese": "pt",
+         "Dutch": "nl",
+         "Russian": "ru",
+         "Chinese (Mandarin)": "zh",
+         "Japanese": "ja",
+         "Korean": "ko",
+         "Arabic": "ar",
+         "Hindi": "hi",
+         "Turkish": "tr",
+         "Polish": "pl",
+         "Swedish": "sv",
+         "Danish": "da",
+         "Norwegian": "no",
+         "Finnish": "fi",
+         "Greek": "el",
+         "Czech": "cs",
+         "Romanian": "ro",
+         "Hungarian": "hu",
+         "Thai": "th",
+         "Vietnamese": "vi",
+         "Indonesian": "id",
+         "Malay": "ms",
+         "Filipino": "fil",
+         "Hebrew": "he",
+         "Ukrainian": "uk",
+     }
+
+     # Multilingual Edge TTS voices organized by language
+     EDGE_TTS_VOICES = {
+         "en": ["en-US-JennyNeural", "en-US-GuyNeural", "en-US-AriaNeural", "en-GB-SoniaNeural", "en-GB-RyanNeural", "en-AU-NatashaNeural"],
+         "es": ["es-ES-ElviraNeural", "es-ES-AlvaroNeural", "es-MX-DaliaNeural", "es-MX-JorgeNeural", "es-AR-ElenaNeural"],
+         "fr": ["fr-FR-DeniseNeural", "fr-FR-HenriNeural", "fr-CA-SylvieNeural", "fr-CA-AntoineNeural", "fr-BE-CharlineNeural"],
+         "de": ["de-DE-KatjaNeural", "de-DE-ConradNeural", "de-AT-IngridNeural", "de-CH-LeniNeural"],
+         "it": ["it-IT-ElsaNeural", "it-IT-DiegoNeural", "it-IT-IsabellaNeural"],
+         "pt": ["pt-BR-FranciscaNeural", "pt-BR-AntonioNeural", "pt-PT-RaquelNeural", "pt-PT-DuarteNeural"],
+         "nl": ["nl-NL-ColetteNeural", "nl-NL-MaartenNeural", "nl-BE-DenaNeural"],
+         "ru": ["ru-RU-SvetlanaNeural", "ru-RU-DmitryNeural"],
+         "zh": ["zh-CN-XiaoxiaoNeural", "zh-CN-YunxiNeural", "zh-TW-HsiaoChenNeural", "zh-HK-HiuMaanNeural"],
+         "ja": ["ja-JP-NanamiNeural", "ja-JP-KeitaNeural"],
+         "ko": ["ko-KR-SunHiNeural", "ko-KR-InJoonNeural"],
+         "ar": ["ar-SA-ZariyahNeural", "ar-SA-HamedNeural", "ar-EG-SalmaNeural"],
+         "hi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"],
+         "tr": ["tr-TR-EmelNeural", "tr-TR-AhmetNeural"],
+         "pl": ["pl-PL-ZofiaNeural", "pl-PL-MarekNeural"],
+         "sv": ["sv-SE-SofieNeural", "sv-SE-MattiasNeural"],
+         "da": ["da-DK-ChristelNeural", "da-DK-JeppeNeural"],
+         "no": ["nb-NO-PernilleNeural", "nb-NO-FinnNeural"],
+         "fi": ["fi-FI-NooraNeural", "fi-FI-HarriNeural"],
+         "el": ["el-GR-AthinaNeural", "el-GR-NestorasNeural"],
+         "cs": ["cs-CZ-VlastaNeural", "cs-CZ-AntoninNeural"],
+         "ro": ["ro-RO-AlinaNeural", "ro-RO-EmilNeural"],
+         "hu": ["hu-HU-NoemiNeural", "hu-HU-TamasNeural"],
+         "th": ["th-TH-PremwadeeNeural", "th-TH-NiwatNeural"],
+         "vi": ["vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural"],
+         "id": ["id-ID-GadisNeural", "id-ID-ArdiNeural"],
+         "ms": ["ms-MY-YasminNeural", "ms-MY-OsmanNeural"],
+         "fil": ["fil-PH-BlessicaNeural", "fil-PH-AngeloNeural"],
+         "he": ["he-IL-HilaNeural", "he-IL-AvriNeural"],
+         "uk": ["uk-UA-PolinaNeural", "uk-UA-OstapNeural"],
+     }
+
+     # STT Provider definitions
+     STT_PROVIDERS = {
+         "OpenAI Whisper API": {
+             "id": "openai_whisper",
+             "cost_tier": "medium",
+             "cost_per_minute": 0.006,
+             "requires_api_key": True,
+         },
+         "Local Whisper (Tiny)": {
+             "id": "local_whisper_tiny",
+             "cost_tier": "free",
+             "cost_per_minute": 0.0,
+             "requires_api_key": False,
+         },
+         "Local Whisper (Base)": {
+             "id": "local_whisper_base",
+             "cost_tier": "free",
+             "cost_per_minute": 0.0,
+             "requires_api_key": False,
+         },
+     }
+
+     # TTS Provider definitions
+     TTS_PROVIDERS = {
+         "Edge-TTS (Free)": {
+             "id": "edge_tts",
+             "cost_tier": "free",
+             "cost_per_1k_chars": 0.0,
+             "requires_api_key": False,
+             "voices": []  # Will be populated dynamically based on language
+         },
+         "OpenAI TTS": {
+             "id": "openai_tts",
+             "cost_tier": "medium",
+             "cost_per_1k_chars": 0.015,
+             "requires_api_key": True,
+             "voices": ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
+         },
+         "gTTS (Free)": {
+             "id": "gtts",
+             "cost_tier": "free",
+             "cost_per_1k_chars": 0.0,
+             "requires_api_key": False,
+             "voices": ["default"]
+         },
+     }
+
+     # Default selections
+     DEFAULT_STT = "OpenAI Whisper API"
+     DEFAULT_TTS = "Edge-TTS (Free)"
+     DEFAULT_TTS_VOICE = "en-US-JennyNeural"
+     DEFAULT_LANGUAGE = "English"
+
+
+ # ============================================================================
+ # Abstract Base Classes
+ # ============================================================================
+
+ class STTProvider(ABC):
+     """Abstract base class for Speech-to-Text providers."""
+
+     @abstractmethod
+     def transcribe(self, audio_path: str) -> str:
+         """
+         Transcribe audio file to text.
+
+         Args:
+             audio_path: Path to audio file
+
+         Returns:
+             Transcribed text
+         """
+         pass
+
+
+ class TTSProvider(ABC):
+     """Abstract base class for Text-to-Speech providers."""
+
+     @abstractmethod
+     def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
+         """
+         Synthesize text to speech.
+
+         Args:
+             text: Text to convert to speech
+             output_path: Optional path to save audio file
+
+         Returns:
+             Path to generated audio file
+         """
+         pass
+
+     @abstractmethod
+     def get_available_voices(self) -> List[str]:
+         """Get list of available voices for this provider."""
+         pass
+
+
+ # ============================================================================
+ # STT Provider Implementations
+ # ============================================================================
+
+ class OpenAIWhisperSTT(STTProvider):
+     """OpenAI Whisper API implementation."""
+
+     def __init__(self, api_key: Optional[str] = None):
+         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.")
+
+         self.client = OpenAI(api_key=self.api_key)
+
+     def transcribe(self, audio_path: str) -> str:
+         """Transcribe audio using OpenAI Whisper API."""
+         try:
+             with open(audio_path, "rb") as audio_file:
+                 transcript = self.client.audio.transcriptions.create(
+                     model="whisper-1",
+                     file=audio_file
+                 )
+             return transcript.text
+         except Exception as e:
+             raise Exception(f"OpenAI Whisper transcription failed: {str(e)}")
+
+
+ class LocalWhisperSTT(STTProvider):
+     """Local Whisper model implementation."""
+
+     def __init__(self, model_size: str = "base"):
+         """
+         Initialize local Whisper model.
+
+         Args:
+             model_size: Model size (tiny, base, small, medium, large)
+         """
+         self.model_size = model_size
+         self.model = None
+
+     def _load_model(self):
+         """Lazy load the model."""
+         if self.model is None:
+             self.model = whisper.load_model(self.model_size)
+
+     def transcribe(self, audio_path: str) -> str:
+         """Transcribe audio using local Whisper model."""
+         self._load_model()
+         try:
+             result = self.model.transcribe(audio_path)
+             return result["text"]
+         except Exception as e:
+             raise Exception(f"Local Whisper transcription failed: {str(e)}")
+
+
+ # ============================================================================
+ # TTS Provider Implementations
+ # ============================================================================
+
+ class EdgeTTSProvider(TTSProvider):
+     """Microsoft Edge TTS implementation (free)."""
+
+     def __init__(self, voice: str = "en-US-JennyNeural"):
+         self.voice = voice
+
+     def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
+         """Synthesize speech using Edge TTS."""
+         if output_path is None:
+             output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3")
+
+         try:
+             # Edge TTS requires async
+             async def _synthesize():
+                 communicate = edge_tts.Communicate(text, self.voice)
+                 await communicate.save(output_path)
+
+             asyncio.run(_synthesize())
+             return output_path
+         except Exception as e:
+             raise Exception(f"Edge TTS synthesis failed: {str(e)}")
+
+     def get_available_voices(self) -> List[str]:
+         """Get available Edge TTS voices."""
+         return VoiceConfig.TTS_PROVIDERS["Edge-TTS (Free)"]["voices"]
+
+
+ class OpenAITTSProvider(TTSProvider):
+     """OpenAI TTS implementation."""
+
+     def __init__(self, voice: str = "nova", api_key: Optional[str] = None):
+         self.voice = voice
+         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.")
+
+         self.client = OpenAI(api_key=self.api_key)
+
+     def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
+         """Synthesize speech using OpenAI TTS."""
+         if output_path is None:
+             output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3")
+
+         try:
+             response = self.client.audio.speech.create(
+                 model="tts-1",
+                 voice=self.voice,
+                 input=text
+             )
+             response.stream_to_file(output_path)
+             return output_path
+         except Exception as e:
+             raise Exception(f"OpenAI TTS synthesis failed: {str(e)}")
+
+     def get_available_voices(self) -> List[str]:
+         """Get available OpenAI TTS voices."""
+         return VoiceConfig.TTS_PROVIDERS["OpenAI TTS"]["voices"]
+
+
+ class GTTSProvider(TTSProvider):
+     """Google TTS implementation (free, basic quality)."""
+
+     def __init__(self, voice: str = "default", language: str = "en"):
+         self.voice = voice
+         self.language = language
+
+     def synthesize(self, text: str, output_path: Optional[str] = None) -> str:
+         """Synthesize speech using gTTS."""
+         if output_path is None:
+             output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3")
+
+         try:
+             tts = gTTS(text=text, lang=self.language)
+             tts.save(output_path)
+             return output_path
+         except Exception as e:
+             raise Exception(f"gTTS synthesis failed: {str(e)}")
+
+     def get_available_voices(self) -> List[str]:
+         """Get available gTTS voices."""
+         return VoiceConfig.TTS_PROVIDERS["gTTS (Free)"]["voices"]
+
+
+ # ============================================================================
+ # Factory Functions
+ # ============================================================================
+
+ def create_stt_provider(provider_name: str) -> STTProvider:
+     """
+     Create an STT provider instance.
+
+     Args:
+         provider_name: Name of the provider (from VoiceConfig.STT_PROVIDERS)
+
+     Returns:
+         STTProvider instance
+     """
+     provider_id = VoiceConfig.STT_PROVIDERS[provider_name]["id"]
+
+     if provider_id == "openai_whisper":
+         return OpenAIWhisperSTT()
+     elif provider_id == "local_whisper_tiny":
+         return LocalWhisperSTT(model_size="tiny")
+     elif provider_id == "local_whisper_base":
+         return LocalWhisperSTT(model_size="base")
+     else:
+         raise ValueError(f"Unknown STT provider: {provider_name}")
+
+
+ def create_tts_provider(provider_name: str, voice: Optional[str] = None, language: str = "en") -> TTSProvider:
+     """
+     Create a TTS provider instance.
+
+     Args:
+         provider_name: Name of the provider (from VoiceConfig.TTS_PROVIDERS)
+         voice: Optional voice name
+         language: Language code (ISO 639-1)
+
+     Returns:
+         TTSProvider instance
+     """
+     provider_id = VoiceConfig.TTS_PROVIDERS[provider_name]["id"]
+     provider_info = VoiceConfig.TTS_PROVIDERS[provider_name]
+
+     # Use default voice if not specified
+     if voice is None:
+         voice = provider_info["voices"][0] if provider_info["voices"] else None
+
+     if provider_id == "edge_tts":
+         return EdgeTTSProvider(voice=voice)
+     elif provider_id == "openai_tts":
+         return OpenAITTSProvider(voice=voice)
+     elif provider_id == "gtts":
+         return GTTSProvider(voice=voice, language=language)
+     else:
+         raise ValueError(f"Unknown TTS provider: {provider_name}")
+
+
+ def get_available_stt_providers() -> List[str]:
+     """Get list of available STT provider names."""
+     return list(VoiceConfig.STT_PROVIDERS.keys())
+
+
+ def get_available_tts_providers() -> List[str]:
+     """Get list of available TTS provider names."""
+     return list(VoiceConfig.TTS_PROVIDERS.keys())
+
+
+ def get_voices_for_provider(provider_name: str, language: str = "en") -> List[str]:
+     """
+     Get available voices for a TTS provider, optionally filtered by language.
+
+     Args:
+         provider_name: Name of the provider
+         language: Language code (ISO 639-1) for filtering voices
+
+     Returns:
+         List of available voices
+     """
+     if provider_name not in VoiceConfig.TTS_PROVIDERS:
+         return []
+
+     provider_id = VoiceConfig.TTS_PROVIDERS[provider_name]["id"]
+
+     # For Edge TTS, return language-specific voices
+     if provider_id == "edge_tts":
+         return VoiceConfig.EDGE_TTS_VOICES.get(language, VoiceConfig.EDGE_TTS_VOICES.get("en", []))
+
+     # For other providers, return all voices
+     return VoiceConfig.TTS_PROVIDERS[provider_name]["voices"]
+
+
+ def get_provider_info(provider_name: str, provider_type: str = "tts") -> Dict:
+     """
+     Get information about a provider.
+
+     Args:
+         provider_name: Name of the provider
+         provider_type: "stt" or "tts"
+
+     Returns:
+         Provider information dictionary
+     """
+     if provider_type == "tts":
+         return VoiceConfig.TTS_PROVIDERS.get(provider_name, {})
+     else:
+         return VoiceConfig.STT_PROVIDERS.get(provider_name, {})
+
+
+ def get_available_languages() -> List[str]:
+     """Get list of available language names."""
+     return list(VoiceConfig.LANGUAGES.keys())
+
+
+ def get_language_code(language_name: str) -> str:
+     """
+     Get language code from language name.
+
+     Args:
+         language_name: Display name of the language (e.g., "English")
+
+     Returns:
+         Language code (e.g., "en")
+     """
+     return VoiceConfig.LANGUAGES.get(language_name, "en")
+
+
+ def get_default_voice_for_language(language_name: str, provider_name: str = "Edge-TTS (Free)") -> str:
+     """
+     Get the default voice for a specific language and provider.
+
+     Args:
+         language_name: Display name of the language
+         provider_name: Name of the TTS provider
+
+     Returns:
+         Default voice ID for the language
+     """
+     language_code = get_language_code(language_name)
+     voices = get_voices_for_provider(provider_name, language_code)
+
+     if voices:
+         return voices[0]
+
+     # Fallback to English if language not supported
+     return VoiceConfig.DEFAULT_TTS_VOICE
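
The factory functions above are the module's public surface; a usage sketch of the free tiers follows (an assumption for illustration, not part of the commit: no API keys are needed for these providers, and "recording.wav" is a placeholder path you would supply).

# hypothetical round trip: speech in, speech out, using only the free tiers
from voice_handler import (
    create_stt_provider,
    create_tts_provider,
    get_default_voice_for_language,
    get_language_code,
)

stt = create_stt_provider("Local Whisper (Tiny)")    # free, runs locally
text = stt.transcribe("recording.wav")               # placeholder input file

voice = get_default_voice_for_language("Spanish")    # resolves to "es-ES-ElviraNeural"
tts = create_tts_provider("Edge-TTS (Free)", voice=voice,
                          language=get_language_code("Spanish"))
print(tts.synthesize(f"Dijiste: {text}"))            # path to the generated .mp3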