#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#
import gradio as gr
from config import VOICE_MODE_CLONE
from ..core.state import (
    generation_state_lock,
    get_stop_generation_requested,
    set_stop_generation_requested
)
from ..core.authentication import get_huggingface_token
from ..core.memory import (
    has_temporary_files_pending_cleanup,
    cleanup_expired_temporary_files,
    perform_memory_cleanup,
    memory_cleanup,
    trigger_background_cleanup_check
)
from ..tts.manager import text_to_speech_manager
from ..validation.text import validate_text_input
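# Read the shared generation flag under the lock so callers get a consistent snapshot.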
def check_if_generating():
    from ..core.state import is_currently_generating
    with generation_state_lock:
        return is_currently_generating
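# Ask the running generation to stop and return an update that disables the bound control (presumably the stop button).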
def request_generation_stop():
    set_stop_generation_requested(True)
    return gr.update(interactive=False)
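# Main Gradio handler: validates the input, enforces a single active generation, runs TTS, and returns the output file path.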
def perform_speech_generation(
    text_input,
    voice_mode_selection,
    voice_preset_selection,
    voice_clone_audio_file,
    model_variant,
    lsd_decode_steps,
    temperature,
    noise_clamp,
    eos_threshold,
    frames_after_eos,
    enable_custom_frames
):
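    # Import the state module itself so the flag writes below mutate the shared module attributes,
    # then clear any expired temporary files before starting new work.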
    from ..core import state as global_state
    if has_temporary_files_pending_cleanup():
        cleanup_expired_temporary_files()
    perform_memory_cleanup()
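    # validate_text_input returns (is_valid, result): result carries the error message on failure
    # and the text that is later passed to generation on success.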
    is_valid, validation_result = validate_text_input(text_input)
    if not is_valid:
        if validation_result:
            raise gr.Error(validation_result)
        raise gr.Error("Please enter valid text to generate speech.")
    if voice_mode_selection == VOICE_MODE_CLONE:
        if not voice_clone_audio_file:
            raise gr.Error("Please upload an audio file for voice cloning.")
        if not get_huggingface_token():
            raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")
    with generation_state_lock:
        if global_state.is_currently_generating:
            raise gr.Error("A generation is already in progress. Please wait.")
        global_state.is_currently_generating = True
        global_state.stop_generation_requested = False
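    # Track tensors created below so the finally block can release them, then load (or reuse) the requested model
    # and check for a stop request before doing any further work.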
    generated_audio_tensor = None
    cloned_voice_state_tensor = None
    try:
        text_to_speech_manager.load_or_get_model(
            model_variant,
            temperature,
            lsd_decode_steps,
            noise_clamp,
            eos_threshold
        )
        with generation_state_lock:
            if global_state.stop_generation_requested:
                return None
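        # Resolve the voice state from either the uploaded clone audio or the selected preset.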
        if voice_mode_selection == VOICE_MODE_CLONE:
            cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(voice_clone_audio_file)
            voice_state = cloned_voice_state_tensor
        else:
            voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)
        with generation_state_lock:
            if global_state.stop_generation_requested:
                return None
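        # Synthesize the audio, re-checking the stop flag before writing the result to disk.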
        generated_audio_tensor = text_to_speech_manager.generate_audio(
            validation_result,
            voice_state,
            frames_after_eos,
            enable_custom_frames
        )
        with generation_state_lock:
            if global_state.stop_generation_requested:
                return None
        output_file_path = text_to_speech_manager.save_audio_to_file(generated_audio_tensor)
        return output_file_path
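    # Let Gradio errors propagate unchanged; wrap runtime and unexpected errors in user-facing messages.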
    except gr.Error:
        raise
    except RuntimeError as runtime_error:
        raise gr.Error(str(runtime_error))
    except Exception as generation_error:
        raise gr.Error(f"Speech generation failed: {str(generation_error)}")
    finally:
        with generation_state_lock:
            global_state.is_currently_generating = False
            global_state.stop_generation_requested = False
        if generated_audio_tensor is not None:
            del generated_audio_tensor
            generated_audio_tensor = None
        if cloned_voice_state_tensor is not None:
            del cloned_voice_state_tensor
            cloned_voice_state_tensor = None
        memory_cleanup()
        trigger_background_cleanup_check()