# TTSModel / app.py
# farid678's picture
# Update app.py
# 540093e verified
import torch
import numpy as np
import gradio as gr
from transformers import pipeline
import logging
from scipy.io.wavfile import write
import uuid
import os
import warnings
# -----------------------------
# SUPPRESS WARNINGS
# -----------------------------
# Silence FutureWarning messages and keep the "transformers" logger at ERROR.
warnings.filterwarnings(action="ignore", category=FutureWarning)
_transformers_logger = logging.getLogger("transformers")
_transformers_logger.setLevel(logging.ERROR)
# -----------------------------
# DEVICE SETUP
# -----------------------------
# Device index handed to the pipeline: 0 when CUDA is available, else -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
# -----------------------------
# PATH TO FINE-TUNED MODEL
# -----------------------------
model_dir = "./"  # path to the fine-tuned Orpheus files in the Space
# -----------------------------
# LOAD TTS PIPELINE
# -----------------------------
# Built once at import time from the local model directory and reused by
# every inference call.
tts_pipe = pipeline("text-to-speech", model=model_dir, device=device)
# -----------------------------
# INFERENCE FUNCTION
# -----------------------------
def _to_int16_pcm(samples):
    """Convert raw pipeline audio into int16 PCM shaped for ``wavfile.write``.

    ``scipy.io.wavfile.write`` reads a 1-D array as mono and a 2-D array as
    ``(n_samples, n_channels)``, so singleton axes are dropped and a
    channels-first array is transposed before quantizing.

    Raises:
        ValueError: if the audio still has more than two axes after squeezing.
    """
    pcm = np.asarray(samples, dtype=np.float32)
    # Drop singleton axes such as a leading batch dimension: (1, N) -> (N,).
    # atleast_1d keeps a single-sample result writable.
    pcm = np.atleast_1d(np.squeeze(pcm))
    if pcm.ndim > 2:
        raise ValueError(f"Unsupported audio shape: {pcm.shape}")
    # NOTE(review): assumes a 2-D array with fewer rows than columns is
    # (channels, samples); it is flipped to the (samples, channels) layout.
    if pcm.ndim == 2 and pcm.shape[0] < pcm.shape[1]:
        pcm = pcm.T
    # NaN becomes 0; +/-Inf becomes a huge finite value that the clip below
    # pins to +/-1. This avoids a RuntimeWarning in the int16 cast.
    pcm = np.nan_to_num(pcm)
    pcm = np.clip(pcm, -1.0, 1.0)
    # Scale float [-1, 1] to the int16 range.
    return (pcm * 32767).astype(np.int16)


def tts_generate(text):
    """Synthesize speech for ``text`` and return the path of the WAV file.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only input
            produces no audio.

    Returns:
        Path of a newly written ``outputs/<uuid>.wav`` file, or ``None`` when
        there is no text to synthesize.

    Raises:
        ValueError: if the pipeline result has no ``"audio"`` entry or the
            audio has an unsupported shape.
    """
    # Guard against None as well as blank strings before calling .strip().
    if not text or not text.strip():
        return None
    # Run the TTS model.
    output = tts_pipe(text)
    if "audio" not in output:
        raise ValueError("TTS pipeline did not return audio")
    audio_int16 = _to_int16_pcm(output["audio"])
    # Fall back to 22050 Hz when the pipeline reports no sampling rate;
    # int() because the WAV header needs an integer rate.
    sr = int(output.get("sampling_rate") or 22050)
    # Create the output folder and pick a unique file name per request.
    os.makedirs("outputs", exist_ok=True)
    out_path = f"outputs/{uuid.uuid4().hex}.wav"
    # Save the WAV file.
    write(out_path, sr, audio_int16)
    return out_path
# -----------------------------
# SAMPLE TEXTS
# -----------------------------
# Example sentences containing expressive tags.
SAMPLES = [
    "Just end up crashing somewhere. <laugh> No, because remember last time?",
    "Hmm… I don't know. <laugh> This feels like a bad idea. <gasp>",
    "I'm so tired today <yawn> but I still have so much work to do.",
]
# -----------------------------
# GRADIO INTERFACE
# -----------------------------
# One multi-line text box in, one audio file path out. The sample sentences
# are offered as examples and the first one doubles as the placeholder.
demo = gr.Interface(
    title="Fine-tuned Orpheus-3B Expressive TTS",
    fn=tts_generate,
    inputs=gr.Textbox(
        placeholder=SAMPLES[0],
        lines=5,
        label="Enter text (use expressive tags like <laugh>, <sigh>)",
    ),
    outputs=gr.Audio(label="Generated Audio", type="filepath"),
    examples=[[sample] for sample in SAMPLES],
)
# -----------------------------
# CLEAN RUN
# -----------------------------
if __name__ == "__main__":
    # ssr_mode=False is meant to reduce asyncio / "Invalid file descriptor"
    # errors.
    demo.launch(ssr_mode=False)