LugandaTTS / app.py
Bateesa's picture
Update app.py
aeecd38 verified
import gradio as gr
import numpy as np
import torch
from qwen_tts import Qwen3TTSModel
# Identifier of the fine-tuned checkpoint handed to ``from_pretrained``.
MODEL_ID = "Bateesa/QWEN-TTS_Luganda_Base"

# The model is loaded once at import time and shared by every request handler.
tts = Qwen3TTSModel.from_pretrained(
    MODEL_ID,
    device_map="auto",  # automatic device placement
    dtype=torch.bfloat16,
)
def plain_tts(text, language):
    """Synthesize ``text`` without a reference recording.

    Args:
        text: Text to speak.
        language: Language hint forwarded to the model. The UI offers
            ``"auto"`` and ``"english"``; for Luganda, ``"auto"`` together
            with the fine-tuned weights is the intended setting.

    Returns:
        ``(sample_rate, waveform)`` for the first generated utterance, or
        ``None`` when ``text`` is missing or whitespace-only.
    """
    # Nothing to synthesize: skip the model call and leave the output empty.
    if not text or not text.strip():
        return None

    # NOTE(review): this uses the voice-clone entry point with no reference
    # audio. Confirm the installed qwen_tts accepts ref_audio=None when
    # x_vector_only_mode=True; it may require a reference or a prebuilt prompt.
    wavs, sr = tts.generate_voice_clone(
        text=text,
        language=language,
        ref_audio=None,
        x_vector_only_mode=True,
    )
    return sr, wavs[0]
def _prepare_reference_audio(ref_audio):
    """Convert a Gradio audio value to a float32 mono ``(waveform, sample_rate)``."""
    # Anything that is not a 2-tuple (e.g. a file path) is passed through as-is.
    if not isinstance(ref_audio, tuple) or len(ref_audio) != 2:
        return ref_audio

    first, second = ref_audio
    if isinstance(first, np.ndarray):
        # Already ordered as (waveform, sample_rate).
        waveform, sample_rate = first, second
    else:
        # Gradio's type="numpy" order: (sample_rate, samples).
        sample_rate, waveform = first, second

    waveform = np.asarray(waveform)
    if np.issubdtype(waveform.dtype, np.integer):
        # Integer PCM (Gradio normally yields int16): scale into [-1.0, 1.0].
        info = np.iinfo(waveform.dtype)
        if info.min < 0:
            full_scale = float(max(abs(info.min), info.max))
            waveform = waveform.astype(np.float32) / full_scale
        else:
            # Unsigned PCM is centred on the middle of its range.
            midpoint = (info.max + 1) / 2.0
            waveform = (waveform.astype(np.float32) - midpoint) / midpoint
    else:
        waveform = waveform.astype(np.float32)

    if waveform.ndim > 1:
        # Multi-channel audio arrives as (samples, channels); downmix to mono.
        waveform = waveform.mean(axis=-1)

    return waveform, int(sample_rate)


def clone_tts(ref_audio, ref_text, text, language, xvec_only):
    """Speak ``text`` in the voice of a reference recording.

    Args:
        ref_audio: Reference recording from the ``gr.Audio`` input, i.e. a
            ``(sample_rate, samples)`` tuple, or ``None`` if nothing was
            uploaded. A ``(waveform, sample_rate)`` tuple is also accepted.
        ref_text: Transcript of the reference recording.
        text: Text to synthesize.
        language: Language hint forwarded to the model (often ``"auto"``).
        xvec_only: ``False`` uses the reference transcript as well (stronger
            in-context cloning, better likeness); ``True`` copies timbre only.

    Returns:
        ``(sample_rate, waveform)`` for the first generated utterance, or
        ``None`` when the reference audio or the target text is missing.
    """
    # A single gr.Audio output must receive None (not a tuple of Nones) to be
    # cleared; a tuple would be unpacked as (sample_rate, data).
    if ref_audio is None:
        return None
    if not text or not text.strip():
        return None

    # NOTE(review): qwen_tts is assumed to expect (waveform, sample_rate) with
    # float samples, hence the conversion from Gradio's layout - confirm
    # against the installed version.
    wavs, sr = tts.generate_voice_clone(
        text=text,
        language=language,
        ref_audio=_prepare_reference_audio(ref_audio),
        ref_text=ref_text,
        x_vector_only_mode=xvec_only,
    )
    return sr, wavs[0]
# Two-tab interface: reference-free synthesis and reference-based voice cloning.
with gr.Blocks() as demo:
    gr.Markdown("## Luganda / English TTS & Voice Cloning (Qwen3-TTS Fine-tuned)")

    # Tab 1: text + language -> audio.
    with gr.Tab("Plain TTS"):
        txt = gr.Textbox(label="Text")
        lang = gr.Dropdown(
            label="Language",
            choices=["auto", "english"],
            value="auto",
        )
        btn = gr.Button("Generate")
        audio_out = gr.Audio(label="Output", type="numpy")
        btn.click(fn=plain_tts, inputs=[txt, lang], outputs=audio_out)

    # Tab 2: reference recording + transcript + target text -> cloned audio.
    with gr.Tab("Voice Cloning"):
        ref_audio = gr.Audio(label="Reference audio", type="numpy")
        ref_text = gr.Textbox(
            label="Reference transcript (same language as ref audio)"
        )
        clone_text = gr.Textbox(label="Text to speak")
        clone_lang = gr.Dropdown(
            label="Language",
            choices=["auto", "english"],
            value="auto",
        )
        xvec_only = gr.Checkbox(
            label="x-vector only (no ICL, timbre only)",
            value=False,
        )
        clone_btn = gr.Button("Clone Voice")
        clone_out = gr.Audio(label="Cloned output", type="numpy")
        # Input order must match the positional parameters of clone_tts.
        clone_btn.click(
            fn=clone_tts,
            inputs=[ref_audio, ref_text, clone_text, clone_lang, xvec_only],
            outputs=clone_out,
        )

demo.launch()