hadadrjt commited on
Commit
5da0109
·
1 Parent(s): d24817e

Pocket TTS: Let's take this seriously.

Browse files
.dockerignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Dockerfile
2
+ LICENSE
3
+ README.md
Dockerfile CHANGED
@@ -7,4 +7,4 @@ FROM hadadrjt/pocket-tts:hf
7
 
8
  WORKDIR /app
9
 
10
- COPY app.py .
 
7
 
8
  WORKDIR /app
9
 
10
+ COPY . .
app.py CHANGED
@@ -1,1565 +1,227 @@
1
- """
2
- ============================================================================
3
- AI-GENERATED CODE
4
- ============================================================================
5
- """
6
-
7
- """
8
- Pocket TTS Web Application
9
- ==========================
10
-
11
- A Gradio-based web interface for the Pocket TTS text-to-speech model.
12
- This application provides an intuitive interface for generating speech
13
- from text using either preset voices or voice cloning capabilities.
14
-
15
- Features:
16
- ---------
17
- - Multiple preset voice options
18
- - Voice cloning from uploaded audio files
19
- - Configurable generation parameters (temperature, LSD steps, etc.)
20
- - Real-time character counting and validation
21
- - Temporary file management with automatic cleanup
22
- - Thread-safe generation state management
23
-
24
- Usage:
25
- ------
26
- Run this script directly to launch the web application:
27
- $ python app.py
28
-
29
- The application will be available at http://localhost:7860
30
- """
31
-
32
- import os
33
- import time
34
- import torch
35
- import tempfile
36
- import threading
37
- import scipy.io.wavfile
38
- import gradio as gr
39
- from pocket_tts import TTSModel
40
-
41
-
42
- # =============================================================================
43
- # =============================================================================
44
-
45
  #
46
  # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
47
  # SPDX-License-Identifier: Apache-2.0
48
  #
49
 
50
- from huggingface_hub import login
51
-
52
- HF_TOKEN = os.getenv("HF_TOKEN", None)
53
-
54
- if HF_TOKEN:
55
- try:
56
- login(token=HF_TOKEN, add_to_git_credential=False)
57
- print("Authenticated with Hugging Face")
58
-
59
- except Exception as auth_error:
60
- print(f"Hugging Face authentication failed: {auth_error}")
61
- print("Voice cloning may not be available")
62
-
63
- else:
64
- print("Missing Hugging Face authentication required for the license agreement")
65
-
66
- # =============================================================================
67
- # =============================================================================
68
-
69
-
70
- # =============================================================================
71
- # ENVIRONMENT CONFIGURATION
72
- # =============================================================================
73
- # Configure PyTorch threading behavior
74
- torch.set_num_threads(1) # Intra-op parallelism threads
75
- torch.set_num_interop_threads(1) # Inter-op parallelism threads
76
-
77
-
78
- # =============================================================================
79
- # APPLICATION CONSTANTS
80
- # =============================================================================
81
- # Define all configurable constants and default values used throughout
82
- # the application. These values control model behavior, UI constraints,
83
- # and resource management policies.
84
-
85
- # Available preset voice options for speech generation
86
- AVAILABLE_VOICES = [
87
- "alba",
88
- "marius",
89
- "javert",
90
- "jean",
91
- "fantine",
92
- "cosette",
93
- "eponine",
94
- "azelma"
95
- ]
96
-
97
- # Default configuration values
98
- DEFAULT_VOICE = "alba" # Default preset voice selection
99
- DEFAULT_MODEL_VARIANT = "b6369a24" # Model variant identifier
100
- DEFAULT_TEMPERATURE = 0.7 # Generation temperature
101
- DEFAULT_LSD_DECODE_STEPS = 1 # Latent space decode steps
102
- DEFAULT_EOS_THRESHOLD = -4.0 # End-of-sequence detection threshold
103
- DEFAULT_NOISE_CLAMP = 0.0 # Noise clamping value (0 = disabled)
104
- DEFAULT_FRAMES_AFTER_EOS = 10 # Additional frames after EOS
105
-
106
- # Input constraints and resource management
107
- MAXIMUM_INPUT_LENGTH = 1000 # Maximum text input characters
108
- TEMPORARY_FILE_LIFETIME_SECONDS = 7200 # Temp file retention (2 hours)
109
-
110
- # Voice mode selection options
111
- VOICE_MODE_PRESET = "Preset Voices" # Use predefined voice
112
- VOICE_MODE_CLONE = "Voice Cloning" # Clone voice from audio
113
-
114
- # Example prompts with associated voice presets for demonstration
115
- EXAMPLE_PROMPTS_WITH_VOICES = [
116
- {
117
- "text": "The quick brown fox jumps over the lazy dog near the riverbank.",
118
- "voice": "alba"
119
- },
120
- {
121
- "text": "Welcome to the future of text to speech technology powered by artificial intelligence.",
122
- "voice": "marius"
123
- },
124
- {
125
- "text": "Technology continues to push the boundaries of what we thought was possible.",
126
- "voice": "javert"
127
- },
128
- {
129
- "text": "The weather today is absolutely beautiful and perfect for a relaxing walk outside.",
130
- "voice": "fantine"
131
- },
132
- {
133
- "text": "Science and innovation are transforming how we interact with the world around us.",
134
- "voice": "jean"
135
- }
136
- ]
137
-
138
-
139
- # =============================================================================
140
- # THREAD SYNCHRONIZATION
141
- # =============================================================================
142
- # Global state management for thread-safe generation operations.
143
- # These locks and flags prevent concurrent generation requests and
144
- # enable graceful cancellation of ongoing operations.
145
-
146
- generation_state_lock = threading.Lock() # Lock for generation state access
147
- is_currently_generating = False # Flag indicating active generation
148
- stop_generation_requested = False # Flag for stop request signaling
149
-
150
- # Temporary file registry for cleanup management
151
- temporary_files_registry = {} # Maps file paths to creation timestamps
152
- temporary_files_lock = threading.Lock() # Lock for registry access
153
-
154
-
155
- # =============================================================================
156
- # =============================================================================
157
-
158
- #
159
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
160
- # SPDX-License-Identifier: Apache-2.0
161
- #
162
-
163
- import gc
164
- import atexit
165
-
166
- BACKGROUND_CLEANUP_INTERVAL = 300
167
- VOICE_STATE_CACHE_MAXIMUM_SIZE = 8
168
- VOICE_STATE_CACHE_CLEANUP_THRESHOLD = 4
169
-
170
- MAXIMUM_MEMORY_USAGE = 1 * 1024 * 1024 * 1024
171
-
172
- MEMORY_WARNING_THRESHOLD = int(0.7 * MAXIMUM_MEMORY_USAGE)
173
- MEMORY_CRITICAL_THRESHOLD = int(0.85 * MAXIMUM_MEMORY_USAGE)
174
- MEMORY_CHECK_INTERVAL = 30
175
- MEMORY_IDLE_TARGET = int(0.5 * MAXIMUM_MEMORY_USAGE)
176
-
177
- background_cleanup_thread = None
178
- background_cleanup_stop_event = threading.Event()
179
- background_cleanup_trigger_event = threading.Event()
180
-
181
- memory_enforcement_lock = threading.Lock()
182
-
183
- text_to_speech_manager = None
184
-
185
- def get_current_memory_usage():
186
- try:
187
- with open('/proc/self/status', 'r') as status_file:
188
- for line in status_file:
189
-
190
- if line.startswith('VmRSS:'):
191
- memory_value_kb = int(line.split()[1])
192
- return memory_value_kb * 1024
193
-
194
- except Exception:
195
- pass
196
-
197
- try:
198
- with open('/proc/self/statm', 'r') as statm_file:
199
- statm_values = statm_file.read().split()
200
- resident_pages = int(statm_values[1])
201
- page_size = os.sysconf('SC_PAGE_SIZE')
202
- return resident_pages * page_size
203
-
204
- except Exception:
205
- pass
206
-
207
- try:
208
- import resource
209
- memory_usage_kilobytes = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
210
-
211
- import platform
212
- if platform.system() == "Darwin":
213
- return memory_usage_kilobytes
214
-
215
- else:
216
- return memory_usage_kilobytes * 1024
217
-
218
- except Exception:
219
- pass
220
-
221
- return 0
222
-
223
- def check_if_generation_is_currently_active():
224
- with generation_state_lock:
225
- return is_currently_generating
226
-
227
- def is_memory_usage_within_limit():
228
- current_memory_usage = get_current_memory_usage()
229
- return current_memory_usage < MAXIMUM_MEMORY_USAGE
230
-
231
- def is_memory_usage_approaching_limit():
232
- current_memory_usage = get_current_memory_usage()
233
- return current_memory_usage >= MEMORY_WARNING_THRESHOLD
234
-
235
- def is_memory_usage_critical():
236
- current_memory_usage = get_current_memory_usage()
237
- return current_memory_usage >= MEMORY_CRITICAL_THRESHOLD
238
-
239
- def is_memory_above_idle_target():
240
- current_memory_usage = get_current_memory_usage()
241
- return current_memory_usage > MEMORY_IDLE_TARGET
242
-
243
- def force_garbage_collection():
244
- gc.collect(0)
245
- gc.collect(1)
246
- gc.collect(2)
247
-
248
- if torch.cuda.is_available():
249
- torch.cuda.empty_cache()
250
- torch.cuda.synchronize()
251
-
252
- def memory_cleanup():
253
- force_garbage_collection()
254
-
255
- try:
256
- import ctypes
257
-
258
- libc = ctypes.CDLL("libc.so.6")
259
- libc.malloc_trim(0)
260
-
261
- except Exception:
262
- pass
263
-
264
- force_garbage_collection()
265
-
266
- def perform_memory_cleanup():
267
- global text_to_speech_manager
268
-
269
- force_garbage_collection()
270
-
271
- if text_to_speech_manager is not None:
272
- text_to_speech_manager.evict_least_recently_used_voice_states()
273
-
274
- memory_cleanup()
275
-
276
- def enforce_memory_limit_if_exceeded():
277
- global text_to_speech_manager
278
-
279
- with memory_enforcement_lock:
280
- generation_is_active = check_if_generation_is_currently_active()
281
-
282
- current_memory_usage = get_current_memory_usage()
283
-
284
- if current_memory_usage < MEMORY_WARNING_THRESHOLD:
285
- return True
286
-
287
- force_garbage_collection()
288
- current_memory_usage = get_current_memory_usage()
289
-
290
- if current_memory_usage < MEMORY_WARNING_THRESHOLD:
291
- return True
292
-
293
- if text_to_speech_manager is not None:
294
- text_to_speech_manager.evict_least_recently_used_voice_states()
295
-
296
- memory_cleanup()
297
- current_memory_usage = get_current_memory_usage()
298
-
299
- if current_memory_usage < MEMORY_CRITICAL_THRESHOLD:
300
- return True
301
-
302
- if text_to_speech_manager is not None:
303
- text_to_speech_manager.clear_voice_state_cache_completely()
304
-
305
- cleanup_all_temporary_files_immediately()
306
- memory_cleanup()
307
- current_memory_usage = get_current_memory_usage()
308
-
309
- if current_memory_usage < MAXIMUM_MEMORY_USAGE:
310
- return True
311
-
312
- if generation_is_active:
313
- return current_memory_usage < MAXIMUM_MEMORY_USAGE
314
-
315
- if text_to_speech_manager is not None:
316
- text_to_speech_manager.unload_model_completely()
317
-
318
- memory_cleanup()
319
- current_memory_usage = get_current_memory_usage()
320
-
321
- return current_memory_usage < MAXIMUM_MEMORY_USAGE
322
-
323
- def perform_idle_memory_reduction():
324
- global text_to_speech_manager
325
-
326
- if check_if_generation_is_currently_active():
327
- return
328
-
329
- with memory_enforcement_lock:
330
- current_memory_usage = get_current_memory_usage()
331
-
332
- if current_memory_usage <= MEMORY_IDLE_TARGET:
333
- return
334
-
335
- force_garbage_collection()
336
- current_memory_usage = get_current_memory_usage()
337
-
338
- if current_memory_usage <= MEMORY_IDLE_TARGET:
339
- return
340
-
341
- if check_if_generation_is_currently_active():
342
- return
343
-
344
- if text_to_speech_manager is not None:
345
- text_to_speech_manager.evict_least_recently_used_voice_states()
346
-
347
- memory_cleanup()
348
- current_memory_usage = get_current_memory_usage()
349
-
350
- if current_memory_usage <= MEMORY_IDLE_TARGET:
351
- return
352
-
353
- if check_if_generation_is_currently_active():
354
- return
355
-
356
- if text_to_speech_manager is not None:
357
- text_to_speech_manager.clear_voice_state_cache_completely()
358
-
359
- memory_cleanup()
360
- current_memory_usage = get_current_memory_usage()
361
-
362
- if current_memory_usage <= MEMORY_IDLE_TARGET:
363
- return
364
-
365
- if check_if_generation_is_currently_active():
366
- return
367
-
368
- if text_to_speech_manager is not None:
369
- text_to_speech_manager.unload_model_completely()
370
-
371
- memory_cleanup()
372
-
373
- def cleanup_all_temporary_files_immediately():
374
- with temporary_files_lock:
375
- for file_path in list(temporary_files_registry.keys()):
376
- try:
377
- if os.path.exists(file_path):
378
- os.remove(file_path)
379
- del temporary_files_registry[file_path]
380
-
381
- except Exception:
382
- pass
383
-
384
- def has_temporary_files_pending_cleanup():
385
- with temporary_files_lock:
386
-
387
- if len(temporary_files_registry) == 0:
388
- return False
389
-
390
- current_timestamp = time.time()
391
-
392
- for file_path, creation_timestamp in temporary_files_registry.items():
393
- if current_timestamp - creation_timestamp > TEMPORARY_FILE_LIFETIME_SECONDS:
394
- return True
395
-
396
- return False
397
 
398
- def has_any_temporary_files_registered():
399
- with temporary_files_lock:
400
- return len(temporary_files_registry) > 0
401
 
402
- def calculate_time_until_next_file_expiration():
403
- with temporary_files_lock:
404
- if len(temporary_files_registry) == 0:
405
- return None
406
-
407
- current_timestamp = time.time()
408
- minimum_time_until_expiration = None
409
-
410
- for file_path, creation_timestamp in temporary_files_registry.items():
411
- time_since_creation = current_timestamp - creation_timestamp
412
- time_until_expiration = TEMPORARY_FILE_LIFETIME_SECONDS - time_since_creation
413
-
414
- if time_until_expiration <= 0:
415
- return 0
416
-
417
- if minimum_time_until_expiration is None or time_until_expiration < minimum_time_until_expiration:
418
- minimum_time_until_expiration = time_until_expiration
419
-
420
- return minimum_time_until_expiration
421
 
422
- def perform_background_cleanup_cycle():
423
- last_memory_check_timestamp = 0
424
-
425
- while not background_cleanup_stop_event.is_set():
426
- time_until_next_expiration = calculate_time_until_next_file_expiration()
427
- current_timestamp = time.time()
428
- time_since_last_memory_check = current_timestamp - last_memory_check_timestamp
429
-
430
- if time_until_next_expiration is not None:
431
- if time_until_next_expiration <= 0:
432
- wait_duration = 1
433
 
434
- else:
435
- wait_duration = min(
436
- time_until_next_expiration + 1,
437
- MEMORY_CHECK_INTERVAL,
438
- BACKGROUND_CLEANUP_INTERVAL
 
 
 
 
 
439
  )
440
- else:
441
- if is_memory_above_idle_target() and not check_if_generation_is_currently_active():
442
- wait_duration = MEMORY_CHECK_INTERVAL
443
-
444
- else:
445
- background_cleanup_trigger_event.clear()
446
- triggered = background_cleanup_trigger_event.wait(timeout=BACKGROUND_CLEANUP_INTERVAL)
447
-
448
- if background_cleanup_stop_event.is_set():
449
- break
450
-
451
- if triggered:
452
- continue
453
-
454
- else:
455
- if not check_if_generation_is_currently_active():
456
- perform_idle_memory_reduction()
457
 
458
- continue
459
-
460
- background_cleanup_stop_event.wait(timeout=wait_duration)
461
-
462
- if background_cleanup_stop_event.is_set():
463
- break
464
-
465
- if has_temporary_files_pending_cleanup():
466
- cleanup_expired_temporary_files()
467
-
468
- current_timestamp = time.time()
469
- time_since_last_memory_check = current_timestamp - last_memory_check_timestamp
470
-
471
- if time_since_last_memory_check >= MEMORY_CHECK_INTERVAL:
472
- if not check_if_generation_is_currently_active():
473
-
474
- if is_memory_usage_critical():
475
- enforce_memory_limit_if_exceeded()
476
-
477
- elif is_memory_above_idle_target():
478
- perform_idle_memory_reduction()
479
-
480
- last_memory_check_timestamp = current_timestamp
481
-
482
- def trigger_background_cleanup_check():
483
- background_cleanup_trigger_event.set()
484
-
485
- def start_background_cleanup_thread():
486
- global background_cleanup_thread
487
-
488
- if background_cleanup_thread is None or not background_cleanup_thread.is_alive():
489
- background_cleanup_stop_event.clear()
490
- background_cleanup_trigger_event.clear()
491
-
492
- background_cleanup_thread = threading.Thread(
493
- target=perform_background_cleanup_cycle,
494
- daemon=True,
495
- name="BackgroundCleanupThread"
496
- )
497
-
498
- background_cleanup_thread.start()
499
-
500
- def stop_background_cleanup_thread():
501
- background_cleanup_stop_event.set()
502
- background_cleanup_trigger_event.set()
503
-
504
- if background_cleanup_thread is not None and background_cleanup_thread.is_alive():
505
- background_cleanup_thread.join(timeout=5)
506
-
507
- atexit.register(stop_background_cleanup_thread)
508
-
509
- # =============================================================================
510
- # =============================================================================
511
-
512
-
513
- # =============================================================================
514
- # =============================================================================
515
-
516
- #
517
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
518
- # SPDX-License-Identifier: Apache-2.0
519
- #
520
-
521
- import numpy as np
522
-
523
- def convert_audio_to_pcm_wav(input_path):
524
- try:
525
- sample_rate, audio_data = scipy.io.wavfile.read(input_path)
526
-
527
- if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
528
- audio_data = np.clip(audio_data, -1.0, 1.0)
529
- audio_data = (audio_data * 32767).astype(np.int16)
530
-
531
- elif audio_data.dtype == np.int32:
532
- audio_data = (audio_data >> 16).astype(np.int16)
533
-
534
- elif audio_data.dtype == np.uint8:
535
- audio_data = ((audio_data.astype(np.int16) - 128) * 256).astype(np.int16)
536
-
537
- elif audio_data.dtype != np.int16:
538
- audio_data = audio_data.astype(np.int16)
539
-
540
- output_file = tempfile.NamedTemporaryFile(suffix="_converted.wav", delete=False)
541
- scipy.io.wavfile.write(output_file.name, sample_rate, audio_data)
542
-
543
- with temporary_files_lock:
544
- temporary_files_registry[output_file.name] = time.time()
545
-
546
- trigger_background_cleanup_check()
547
-
548
- return output_file.name
549
-
550
- except Exception as conversion_error:
551
- print(f"Warning: {conversion_error}")
552
- return input_path
553
-
554
- # =============================================================================
555
- # =============================================================================
556
-
557
-
558
- # =============================================================================
559
- # TEXT-TO-SPEECH MANAGER CLASS
560
- # =============================================================================
561
-
562
- class TextToSpeechManager:
563
- """
564
- Manages TTS model lifecycle and speech generation operations.
565
-
566
- This class handles model loading, configuration caching, voice state
567
- management, and audio generation. It implements lazy loading and
568
- caching strategies to optimize performance and memory usage.
569
-
570
- Attributes:
571
- loaded_model: Currently loaded TTS model instance
572
- current_configuration: Dict of current model configuration
573
- voice_state_cache: Cache of computed voice states for preset voices
574
-
575
- Example:
576
- >>> manager = TextToSpeechManager()
577
- >>> manager.load_or_get_model("b6369a24", 0.7, 1, None, -4.0)
578
- >>> voice_state = manager.get_voice_state_for_preset("alba")
579
- >>> audio = manager.generate_audio("Hello world", voice_state, 10, False)
580
- """
581
-
582
- def __init__(self):
583
- """Initialize the TTS manager with empty state."""
584
- self.loaded_model = None
585
- self.current_configuration = {}
586
- self.voice_state_cache = {}
587
-
588
- self.voice_state_cache_access_timestamps = {}
589
- self.voice_state_cache_lock = threading.Lock()
590
- self.model_lock = threading.Lock()
591
-
592
- def is_model_loaded(self):
593
- with self.model_lock:
594
- return self.loaded_model is not None
595
-
596
- def unload_model_completely(self):
597
- with self.model_lock:
598
- self.clear_voice_state_cache_completely()
599
-
600
- if self.loaded_model is not None:
601
- del self.loaded_model
602
- self.loaded_model = None
603
-
604
- self.current_configuration = {}
605
-
606
- memory_cleanup()
607
-
608
- def load_or_get_model(
609
- self,
610
- model_variant,
611
- temperature,
612
- lsd_decode_steps,
613
- noise_clamp,
614
- eos_threshold
615
- ):
616
- """
617
- Load a TTS model or return cached instance if configuration matches.
618
-
619
- This method implements lazy loading with configuration-based caching.
620
- If the requested configuration differs from the currently loaded model,
621
- a new model instance is created and the voice state cache is cleared.
622
-
623
- Args:
624
- model_variant: Model variant identifier string
625
- temperature: Generation temperature (float, 0.1-2.0)
626
- lsd_decode_steps: Number of LSD decode steps (int, 1-20)
627
- noise_clamp: Maximum noise value or None to disable
628
- eos_threshold: End-of-sequence detection threshold (float)
629
-
630
- Returns:
631
- TTSModel: Loaded and configured TTS model instance
632
- """
633
- perform_memory_cleanup()
634
-
635
- # Process and validate input parameters with defaults
636
- processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip()
637
- processed_temperature = float(temperature) if temperature is not None else DEFAULT_TEMPERATURE
638
- processed_lsd_steps = int(lsd_decode_steps) if lsd_decode_steps is not None else DEFAULT_LSD_DECODE_STEPS
639
- processed_noise_clamp = float(noise_clamp) if noise_clamp and float(noise_clamp) > 0 else None
640
- processed_eos_threshold = float(eos_threshold) if eos_threshold is not None else DEFAULT_EOS_THRESHOLD
641
-
642
- # Build configuration dictionary for comparison
643
- requested_configuration = {
644
- "variant": processed_variant,
645
- "temp": processed_temperature,
646
- "lsd_decode_steps": processed_lsd_steps,
647
- "noise_clamp": processed_noise_clamp,
648
- "eos_threshold": processed_eos_threshold
649
- }
650
-
651
- with self.model_lock:
652
- # Load new model if configuration changed or no model loaded
653
- if self.loaded_model is None or self.current_configuration != requested_configuration:
654
- if self.loaded_model is not None:
655
- self.clear_voice_state_cache_completely()
656
- del self.loaded_model
657
- self.loaded_model = None
658
- memory_cleanup()
659
-
660
- self.loaded_model = TTSModel.load_model(**requested_configuration)
661
- self.current_configuration = requested_configuration
662
- self.voice_state_cache = {} # Clear cache on model change
663
-
664
- return self.loaded_model
665
-
666
- def clear_voice_state_cache_completely(self):
667
- with self.voice_state_cache_lock:
668
-
669
- for voice_name in list(self.voice_state_cache.keys()):
670
- voice_state_tensor = self.voice_state_cache.pop(voice_name, None)
671
 
672
- if voice_state_tensor is not None:
673
- del voice_state_tensor
 
 
 
674
 
675
- self.voice_state_cache.clear()
676
- self.voice_state_cache_access_timestamps.clear()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677
 
678
- force_garbage_collection()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
679
 
680
- def evict_least_recently_used_voice_states(self):
681
- with self.voice_state_cache_lock:
 
 
 
 
682
 
683
- if len(self.voice_state_cache) <= VOICE_STATE_CACHE_CLEANUP_THRESHOLD:
684
- if len(self.voice_state_cache) > 0:
685
- sorted_voice_names_by_access_time = sorted(
686
- self.voice_state_cache_access_timestamps.keys(),
687
- key=lambda voice_name: self.voice_state_cache_access_timestamps[voice_name]
688
  )
689
 
690
- number_of_entries_to_remove = max(1, len(self.voice_state_cache) // 2)
691
-
692
- for index in range(min(number_of_entries_to_remove, len(sorted_voice_names_by_access_time))):
693
- voice_name_to_remove = sorted_voice_names_by_access_time[index]
694
- voice_state_tensor = self.voice_state_cache.pop(voice_name_to_remove, None)
695
- self.voice_state_cache_access_timestamps.pop(voice_name_to_remove, None)
696
-
697
- if voice_state_tensor is not None:
698
- del voice_state_tensor
699
-
700
- force_garbage_collection()
701
- return
702
 
703
- sorted_voice_names_by_access_time = sorted(
704
- self.voice_state_cache_access_timestamps.keys(),
705
- key=lambda voice_name: self.voice_state_cache_access_timestamps[voice_name]
 
 
 
 
 
706
  )
707
 
708
- number_of_entries_to_remove = len(self.voice_state_cache) - VOICE_STATE_CACHE_CLEANUP_THRESHOLD
709
-
710
- for index in range(number_of_entries_to_remove):
711
- voice_name_to_remove = sorted_voice_names_by_access_time[index]
712
- voice_state_tensor = self.voice_state_cache.pop(voice_name_to_remove, None)
713
- self.voice_state_cache_access_timestamps.pop(voice_name_to_remove, None)
714
-
715
- if voice_state_tensor is not None:
716
- del voice_state_tensor
717
-
718
- force_garbage_collection()
719
-
720
- def get_voice_state_for_preset(self, voice_name):
721
- """
722
- Get or compute voice state for a preset voice.
723
-
724
- Voice states are cached to avoid redundant computation for
725
- frequently used preset voices.
726
-
727
- Args:
728
- voice_name: Name of the preset voice (must be in AVAILABLE_VOICES)
729
-
730
- Returns:
731
- Voice state tensor for the specified preset voice
732
- """
733
- # Validate voice name and fall back to default if invalid
734
- validated_voice = voice_name if voice_name in AVAILABLE_VOICES else DEFAULT_VOICE
735
-
736
- with self.voice_state_cache_lock:
737
- if validated_voice in self.voice_state_cache:
738
- self.voice_state_cache_access_timestamps[validated_voice] = time.time()
739
- return self.voice_state_cache[validated_voice]
740
-
741
- if is_memory_usage_approaching_limit():
742
- self.evict_least_recently_used_voice_states()
743
-
744
- if len(self.voice_state_cache) >= VOICE_STATE_CACHE_MAXIMUM_SIZE:
745
- self.evict_least_recently_used_voice_states()
746
-
747
- with self.model_lock:
748
- if self.loaded_model is None:
749
- raise RuntimeError("TTS model is not loaded. Please try again.")
750
-
751
- # Compute and cache voice state if not already cached
752
- if validated_voice not in self.voice_state_cache:
753
-
754
- computed_voice_state = self.loaded_model.get_state_for_audio_prompt(
755
- audio_conditioning=validated_voice,
756
- truncate=False
757
  )
758
-
759
- with self.voice_state_cache_lock:
760
- self.voice_state_cache[validated_voice] = computed_voice_state
761
- self.voice_state_cache_access_timestamps[validated_voice] = time.time()
762
-
763
- return self.voice_state_cache[validated_voice]
764
-
765
- def get_voice_state_for_clone(self, audio_file_path):
766
- """
767
- Compute voice state from an uploaded audio file for voice cloning.
768
-
769
- Unlike preset voices, cloned voice states are not cached as they
770
- are typically unique per request. The audio file is first converted
771
- to PCM WAV format to ensure compatibility with the model.
772
-
773
- Args:
774
- audio_file_path: Path to the uploaded audio file
775
-
776
- Returns:
777
- Voice state tensor extracted from the audio file
778
- """
779
- with self.model_lock:
780
- if self.loaded_model is None:
781
- raise RuntimeError("TTS model is not loaded. Please try again.")
782
-
783
- converted_audio_path = convert_audio_to_pcm_wav(audio_file_path)
784
-
785
- return self.loaded_model.get_state_for_audio_prompt(
786
- audio_conditioning=converted_audio_path,
787
- truncate=False
788
- )
789
 
790
- def generate_audio(self, text_content, voice_state, frames_after_eos, enable_custom_frames):
791
- """
792
- Generate speech audio from text using the specified voice state.
793
-
794
- Args:
795
- text_content: Text string to convert to speech
796
- voice_state: Pre-computed voice state tensor
797
- frames_after_eos: Number of frames to generate after EOS
798
- enable_custom_frames: Whether to use custom frame count
799
-
800
- Returns:
801
- torch.Tensor: Generated audio waveform
802
- """
803
- with self.model_lock:
804
- if self.loaded_model is None:
805
- raise RuntimeError("TTS model is not loaded. Please try again.")
806
-
807
- # Apply custom frames setting if enabled
808
- processed_frames = int(frames_after_eos) if enable_custom_frames else None
809
-
810
- generated_audio = self.loaded_model.generate_audio(
811
- model_state=voice_state,
812
- text_to_generate=text_content,
813
- frames_after_eos=processed_frames,
814
- copy_state=True
815
  )
816
-
817
- force_garbage_collection()
818
-
819
- return generated_audio
820
-
821
- def save_audio_to_file(self, audio_tensor):
822
- """
823
- Save generated audio tensor to a temporary WAV file.
824
-
825
- The file is registered for automatic cleanup after the configured
826
- lifetime expires.
827
-
828
- Args:
829
- audio_tensor: PyTorch tensor containing audio waveform
830
-
831
- Returns:
832
- str: Path to the saved temporary WAV file
833
- """
834
- with self.model_lock:
835
- if self.loaded_model is None:
836
- raise RuntimeError("TTS model is not loaded. Cannot save audio.")
837
-
838
- audio_sample_rate = self.loaded_model.sample_rate
839
-
840
- # Convert tensor to numpy array for scipy
841
- audio_numpy_data = audio_tensor.numpy()
842
-
843
- # Create temporary file and write audio data
844
- output_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
845
- scipy.io.wavfile.write(output_file.name, audio_sample_rate, audio_numpy_data)
846
-
847
- # Register file for cleanup tracking
848
- with temporary_files_lock:
849
- temporary_files_registry[output_file.name] = time.time()
850
-
851
- trigger_background_cleanup_check()
852
-
853
- return output_file.name
854
-
855
-
856
- # Create global TTS manager instance
857
- text_to_speech_manager = TextToSpeechManager()
858
-
859
-
860
- # =============================================================================
861
- # UTILITY FUNCTIONS
862
- # =============================================================================
863
-
864
- def cleanup_expired_temporary_files():
865
- """
866
- Remove temporary files that have exceeded their lifetime.
867
-
868
- This function is called periodically to prevent disk space exhaustion
869
- from accumulated temporary audio files. Files older than
870
- TEMPORARY_FILE_LIFETIME_SECONDS are removed from disk and registry.
871
- """
872
- current_timestamp = time.time()
873
- expired_files = []
874
-
875
- with temporary_files_lock:
876
- # Identify expired files
877
- for file_path, creation_timestamp in list(temporary_files_registry.items()):
878
- if current_timestamp - creation_timestamp > TEMPORARY_FILE_LIFETIME_SECONDS:
879
- expired_files.append(file_path)
880
-
881
- # Remove expired files from disk and registry
882
- for file_path in expired_files:
883
- try:
884
- if os.path.exists(file_path):
885
- os.remove(file_path)
886
- del temporary_files_registry[file_path]
887
- except Exception:
888
- pass # Silently ignore deletion errors
889
 
890
-
891
- def validate_text_input(text_content):
892
- """
893
- Validate and clean text input for speech generation.
894
-
895
- Args:
896
- text_content: Raw text input from user
897
-
898
- Returns:
899
- tuple: (is_valid: bool, result: str)
900
- - If valid: (True, cleaned_text)
901
- - If invalid: (False, error_message or empty string)
902
- """
903
- # Check for None or non-string input
904
- if not text_content or not isinstance(text_content, str):
905
- return False, ""
906
-
907
- # Clean whitespace
908
- cleaned_text = text_content.strip()
909
-
910
- # Check for empty content
911
- if not cleaned_text:
912
- return False, ""
913
-
914
- # Check length constraint
915
- if len(cleaned_text) > MAXIMUM_INPUT_LENGTH:
916
- return False, f"Input exceeds maximum length of {MAXIMUM_INPUT_LENGTH} characters."
917
-
918
- return True, cleaned_text
919
-
920
-
921
- # =============================================================================
922
- # =============================================================================
923
-
924
- #
925
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
926
- # SPDX-License-Identifier: Apache-2.0
927
- #
928
-
929
- def check_if_generating():
930
- with generation_state_lock:
931
- return is_currently_generating
932
-
933
- # =============================================================================
934
- # =============================================================================
935
-
936
-
937
- def request_generation_stop():
938
- """
939
- Signal a request to stop the current generation.
940
-
941
- Returns:
942
- gr.update: Update to disable the stop button
943
- """
944
- global stop_generation_requested
945
- with generation_state_lock:
946
- stop_generation_requested = True
947
- return gr.update(interactive=False)
948
-
949
-
950
- # =============================================================================
951
- # SPEECH GENERATION FUNCTION
952
- # =============================================================================
953
-
954
- def perform_speech_generation(
955
- text_input,
956
- voice_mode_selection,
957
- voice_preset_selection,
958
- voice_clone_audio_file,
959
- model_variant,
960
- lsd_decode_steps,
961
- temperature,
962
- noise_clamp,
963
- eos_threshold,
964
- frames_after_eos,
965
- enable_custom_frames
966
- ):
967
- """
968
- Perform the complete speech generation workflow.
969
-
970
- This function orchestrates the entire generation process including:
971
- validation, model loading, voice state preparation, audio generation,
972
- and file saving. It handles thread safety and stop requests.
973
-
974
- Args:
975
- text_input: Text to convert to speech
976
- voice_mode_selection: "Preset Voices" or "Voice Cloning"
977
- voice_preset_selection: Selected preset voice name
978
- voice_clone_audio_file: Path to uploaded audio for cloning
979
- model_variant: Model variant identifier
980
- lsd_decode_steps: Number of LSD decode steps
981
- temperature: Generation temperature
982
- noise_clamp: Noise clamping value
983
- eos_threshold: End-of-sequence threshold
984
- frames_after_eos: Frames to generate after EOS
985
- enable_custom_frames: Whether to use custom frame count
986
-
987
- Returns:
988
- str or None: Path to generated audio file, or None if stopped
989
-
990
- Raises:
991
- gr.Error: On validation failure or generation error
992
- """
993
- global is_currently_generating, stop_generation_requested
994
-
995
- # Run cleanup before starting new generation
996
- if has_temporary_files_pending_cleanup():
997
- cleanup_expired_temporary_files()
998
-
999
- perform_memory_cleanup()
1000
-
1001
- # Validate text input
1002
- is_valid, validation_result = validate_text_input(text_input)
1003
-
1004
- if not is_valid:
1005
- if validation_result:
1006
- raise gr.Error(validation_result)
1007
- raise gr.Error("Please enter valid text to generate speech.")
1008
-
1009
- # Validate voice cloning audio if in clone mode
1010
- if voice_mode_selection == VOICE_MODE_CLONE:
1011
- if not voice_clone_audio_file:
1012
- raise gr.Error("Please upload an audio file for voice cloning.")
1013
- if not HF_TOKEN:
1014
- raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")
1015
-
1016
- # Acquire generation lock
1017
- with generation_state_lock:
1018
- if is_currently_generating:
1019
- raise gr.Error("A generation is already in progress. Please wait.")
1020
- is_currently_generating = True
1021
- stop_generation_requested = False
1022
-
1023
- generated_audio_tensor = None
1024
- cloned_voice_state_tensor = None
1025
-
1026
- try:
1027
- # Load or retrieve cached model
1028
- text_to_speech_manager.load_or_get_model(
1029
- model_variant,
1030
- temperature,
1031
- lsd_decode_steps,
1032
- noise_clamp,
1033
- eos_threshold
1034
- )
1035
-
1036
- # Check for stop request after model loading
1037
- with generation_state_lock:
1038
- if stop_generation_requested:
1039
- return None
1040
-
1041
- # Prepare voice state based on mode
1042
- if voice_mode_selection == VOICE_MODE_CLONE:
1043
- cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(voice_clone_audio_file)
1044
- voice_state = cloned_voice_state_tensor
1045
-
1046
- else:
1047
- voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)
1048
-
1049
- # Check for stop request after voice state preparation
1050
- with generation_state_lock:
1051
- if stop_generation_requested:
1052
- return None
1053
-
1054
- # Generate audio from text
1055
- generated_audio_tensor = text_to_speech_manager.generate_audio(
1056
- validation_result,
1057
- voice_state,
1058
- frames_after_eos,
1059
- enable_custom_frames
1060
- )
1061
-
1062
- # Check for stop request after generation
1063
- with generation_state_lock:
1064
- if stop_generation_requested:
1065
- return None
1066
-
1067
- # Save audio to temporary file
1068
- output_file_path = text_to_speech_manager.save_audio_to_file(generated_audio_tensor)
1069
-
1070
- return output_file_path
1071
-
1072
- except gr.Error:
1073
- raise
1074
-
1075
- except RuntimeError as runtime_error:
1076
- raise gr.Error(str(runtime_error))
1077
-
1078
- except Exception as generation_error:
1079
- raise gr.Error(f"Speech generation failed: {str(generation_error)}")
1080
-
1081
- finally:
1082
- # Always release generation lock
1083
- with generation_state_lock:
1084
- is_currently_generating = False
1085
- stop_generation_requested = False
1086
-
1087
- if generated_audio_tensor is not None:
1088
- del generated_audio_tensor
1089
- generated_audio_tensor = None
1090
-
1091
- if cloned_voice_state_tensor is not None:
1092
- del cloned_voice_state_tensor
1093
- cloned_voice_state_tensor = None
1094
-
1095
- memory_cleanup()
1096
-
1097
- trigger_background_cleanup_check()
1098
-
1099
-
1100
- # =============================================================================
1101
- # UI STATE MANAGEMENT FUNCTIONS
1102
- # =============================================================================
1103
-
1104
- def check_generate_button_state(text_content, ui_state):
1105
- """
1106
- Update generate button interactivity based on text validity and UI state.
1107
-
1108
- Args:
1109
- text_content: Current text input content
1110
- ui_state: Current UI state dictionary
1111
-
1112
- Returns:
1113
- gr.update: Update with interactive state
1114
- """
1115
-
1116
- if ui_state.get("generating", False):
1117
- return gr.update(interactive=False)
1118
-
1119
- is_valid, _ = validate_text_input(text_content)
1120
- return gr.update(interactive=is_valid)
1121
-
1122
-
1123
- def calculate_character_count_display(text_content):
1124
- """
1125
- Generate HTML for character count display with color coding.
1126
-
1127
- Args:
1128
- text_content: Current text input content
1129
-
1130
- Returns:
1131
- str: HTML string for character count display
1132
- """
1133
- character_count = len(text_content) if text_content else 0
1134
-
1135
- # Use error color if over limit
1136
- display_color = (
1137
- "var(--error-text-color)"
1138
- if character_count > MAXIMUM_INPUT_LENGTH
1139
- else "var(--body-text-color-subdued)"
1140
- )
1141
-
1142
- return f"<div style='text-align: right; padding: 4px 0;'><span style='color: {display_color}; font-size: 0.85em;'>{character_count} / {MAXIMUM_INPUT_LENGTH}</span></div>"
1143
-
1144
-
1145
- def determine_clear_button_visibility(text_content, audio_output, ui_state):
1146
- """
1147
- Determine clear button visibility based on content state and UI state.
1148
- Clear button is ALWAYS hidden during generation to prevent race conditions.
1149
-
1150
- Args:
1151
- text_content: Current text input content
1152
- audio_output: Current audio output value
1153
- ui_state: Current UI state dictionary
1154
-
1155
- Returns:
1156
- gr.update: Update with visibility state
1157
- """
1158
-
1159
- if ui_state.get("generating", False):
1160
- return gr.update(visible=False)
1161
-
1162
- has_text_content = bool(text_content and text_content.strip())
1163
- has_audio_output = audio_output is not None
1164
- should_show_clear = has_text_content or has_audio_output
1165
- return gr.update(visible=should_show_clear)
1166
-
1167
-
1168
- def update_voice_mode_visibility(voice_mode_value):
1169
- """
1170
- Update visibility of voice selection containers based on mode.
1171
-
1172
- Args:
1173
- voice_mode_value: Selected voice mode
1174
-
1175
- Returns:
1176
- tuple: (preset_container_update, clone_container_update)
1177
- """
1178
- if voice_mode_value == VOICE_MODE_CLONE:
1179
- return gr.update(visible=False), gr.update(visible=True)
1180
- else:
1181
- return gr.update(visible=True), gr.update(visible=False)
1182
-
1183
-
1184
- # =============================================================================
1185
- # =============================================================================
1186
-
1187
- #
1188
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
1189
- # SPDX-License-Identifier: Apache-2.0
1190
- #
1191
-
1192
- def switch_to_generating_state(ui_state):
1193
- new_state = {"generating": True}
1194
- return (
1195
- gr.update(visible=False),
1196
- gr.update(visible=True, interactive=True),
1197
- gr.update(visible=False),
1198
- new_state
1199
- )
1200
-
1201
- # =============================================================================
1202
- # =============================================================================
1203
-
1204
-
1205
- def switch_to_idle_state(text_content, audio_output, ui_state):
1206
- """
1207
- Switch UI back to idle state after generation.
1208
-
1209
- Args:
1210
- text_content: Current text input content
1211
- audio_output: Current audio output value
1212
- ui_state: Current UI state dictionary (will be updated to idle)
1213
-
1214
- Returns:
1215
- tuple: Updates for (generate_button, stop_button, clear_button, ui_state)
1216
- """
1217
- new_state = {"generating": False}
1218
-
1219
- has_text_content = bool(text_content and text_content.strip())
1220
- has_audio_output = audio_output is not None
1221
- should_show_clear = has_text_content or has_audio_output
1222
-
1223
- return (
1224
- gr.update(visible=True), # Show generate button
1225
- gr.update(visible=False), # Hide stop button
1226
- gr.update(visible=should_show_clear), # Show clear if content exists
1227
- new_state # Update state to idle
1228
- )
1229
-
1230
-
1231
- def perform_clear_action():
1232
- """
1233
- Clear all input and output fields.
1234
-
1235
- Returns:
1236
- tuple: Reset values for all clearable components
1237
- """
1238
- return (
1239
- "", # Clear text input
1240
- None, # Clear audio output
1241
- gr.update(visible=False), # Hide clear button
1242
- VOICE_MODE_PRESET, # Reset voice mode
1243
- DEFAULT_VOICE, # Reset voice preset
1244
- None # Clear clone audio
1245
- )
1246
-
1247
-
1248
- # =============================================================================
1249
- # EXAMPLE HANDLING FUNCTIONS
1250
- # =============================================================================
1251
-
1252
- def create_example_handler(example_text, example_voice):
1253
- """
1254
- Create a handler function for example button clicks.
1255
-
1256
- Args:
1257
- example_text: Example text to set
1258
- example_voice: Example voice to select
1259
-
1260
- Returns:
1261
- function: Handler that sets example values
1262
- """
1263
- def set_example_values():
1264
- return example_text, VOICE_MODE_PRESET, example_voice
1265
- return set_example_values
1266
-
1267
-
1268
- def format_example_button_label(example_text, example_voice, max_text_length=40):
1269
- """
1270
- Format example button label with voice and truncated text.
1271
-
1272
- Args:
1273
- example_text: Full example text
1274
- example_voice: Voice name
1275
- max_text_length: Maximum text length before truncation
1276
-
1277
- Returns:
1278
- str: Formatted button label
1279
- """
1280
- truncated_text = (
1281
- example_text[:max_text_length] + "..."
1282
- if len(example_text) > max_text_length
1283
- else example_text
1284
- )
1285
- return f"[{example_voice}] {truncated_text}"
1286
-
1287
-
1288
- start_background_cleanup_thread()
1289
-
1290
-
1291
- # =============================================================================
1292
- # GRADIO APPLICATION DEFINITION
1293
- # =============================================================================
1294
-
1295
- with gr.Blocks() as application:
1296
- ui_state = gr.State({"generating": False})
1297
-
1298
- # -------------------------------------------------------------------------
1299
- # SIDEBAR SECTION
1300
- # -------------------------------------------------------------------------
1301
- # Contains project information, description, and credits
1302
-
1303
- with gr.Sidebar():
1304
- gr.HTML(
1305
- """
1306
- <h1>Audio Generation Playground part of the
1307
- <a href="https://huggingface.co/spaces/hadadxyz/ai" target="_blank">
1308
- Demo Playground</a>, and the
1309
- <a href="https://huggingface.co/umint" target="_blank">
1310
- UltimaX Intelligence</a> project.</h1><br />
1311
-
1312
- This space runs the <b><a href="https://huggingface.co/kyutai/pocket-tts"
1313
- target="_blank">Pocket TTS</a></b> model from <b>Kyutai</b>.<br /><br />
1314
-
1315
- A lightweight text-to-speech (TTS) application designed to run
1316
- efficiently on CPUs. Forget about the hassle of using GPUs and
1317
- web APIs serving TTS models.<br /><br />
1318
-
1319
- Additionally, this space runs with a custom Docker image to
1320
- maximize the model's potential and has been optimized for the
1321
- limited scope of Hugging Face Spaces.<br /><br />
1322
-
1323
- ⚠️ This space was created entirely by the
1324
- <b><a href="https://huggingface.co/hadadrjt/JARVIS" target="_blank">
1325
- J.A.R.V.I.S.</a></b> model operating in autonomous agent mode.
1326
- All code was generated by AI without human review.<br /><br />
1327
-
1328
- This is an experimental space and is not part of production.
1329
- There may be minor bugs since the code was generated by AI.
1330
- However, none have been found so far.<br /><br />
1331
-
1332
- If you find a bug, please report it in the community tab.<br /><br />
1333
-
1334
- <b>Like this project? You can support me by buying a
1335
- <a href="https://ko-fi.com/hadad" target="_blank">coffee</a></b>
1336
- """
1337
- )
1338
-
1339
- # -------------------------------------------------------------------------
1340
- # AUDIO OUTPUT SECTION
1341
- # -------------------------------------------------------------------------
1342
-
1343
- audio_output_component = gr.Audio(
1344
- label="Generated Speech Output",
1345
- type="filepath",
1346
- interactive=False,
1347
- show_download_button=True
1348
- )
1349
-
1350
- # -------------------------------------------------------------------------
1351
- # VOICE SELECTION SECTION
1352
- # -------------------------------------------------------------------------
1353
-
1354
- with gr.Accordion("🎭 Voice Selection", open=True):
1355
- # Voice mode selector (preset vs cloning)
1356
- voice_mode_radio = gr.Radio(
1357
- label="Voice Mode",
1358
- choices=[VOICE_MODE_PRESET, VOICE_MODE_CLONE],
1359
- value=VOICE_MODE_PRESET,
1360
- info="Choose between preset voices or clone a voice from uploaded audio"
1361
- )
1362
-
1363
- # Container for preset voice selection
1364
- with gr.Column(visible=True) as preset_voice_container:
1365
- voice_preset_dropdown = gr.Dropdown(
1366
- label="Select Preset Voice",
1367
- choices=AVAILABLE_VOICES,
1368
- value=DEFAULT_VOICE
1369
- )
1370
-
1371
- # Container for voice cloning audio upload
1372
- with gr.Column(visible=False) as clone_voice_container:
1373
- voice_clone_audio_input = gr.Audio(
1374
- label="Upload Audio for Voice Cloning",
1375
- type="filepath"
1376
- )
1377
-
1378
- # -------------------------------------------------------------------------
1379
- # GENERATION PARAMETERS SECTION
1380
- # -------------------------------------------------------------------------
1381
-
1382
- with gr.Accordion("⚙️ Generation Parameters", open=False):
1383
- with gr.Row():
1384
- temperature_slider = gr.Slider(
1385
- label="Temperature",
1386
- minimum=0.1,
1387
- maximum=2.0,
1388
- step=0.05,
1389
- value=DEFAULT_TEMPERATURE,
1390
- info="Higher values produce more expressive speech"
1391
- )
1392
- lsd_decode_steps_slider = gr.Slider(
1393
- label="LSD Decode Steps",
1394
- minimum=1,
1395
- maximum=20,
1396
- step=1,
1397
- value=DEFAULT_LSD_DECODE_STEPS,
1398
- info="More steps may improve quality but slower"
1399
  )
1400
 
1401
- with gr.Row():
1402
- noise_clamp_slider = gr.Slider(
1403
- label="Noise Clamp",
1404
- minimum=0.0,
1405
- maximum=2.0,
1406
- step=0.05,
1407
- value=DEFAULT_NOISE_CLAMP,
1408
- info="Maximum noise sampling value (0 = disabled)"
1409
- )
1410
- eos_threshold_slider = gr.Slider(
1411
- label="End of Sequence Threshold",
1412
- minimum=-10.0,
1413
- maximum=0.0,
1414
- step=0.25,
1415
- value=DEFAULT_EOS_THRESHOLD,
1416
- info="Smaller values cause earlier completion"
1417
  )
1418
 
1419
- # -------------------------------------------------------------------------
1420
- # ADVANCED SETTINGS SECTION
1421
- # -------------------------------------------------------------------------
1422
-
1423
- with gr.Accordion("🔧 Advanced Settings", open=False):
1424
- model_variant_textbox = gr.Textbox(
1425
- label="Model Variant Identifier",
1426
- value=DEFAULT_MODEL_VARIANT,
1427
- info="Model signature for generation"
1428
- )
1429
-
1430
- with gr.Row():
1431
- enable_custom_frames_checkbox = gr.Checkbox(
1432
- label="Enable Custom Frames After EOS",
1433
- value=False,
1434
- info="Manually control post-EOS frame generation"
1435
- )
1436
- frames_after_eos_slider = gr.Slider(
1437
- label="Frames After EOS",
1438
- minimum=0,
1439
- maximum=100,
1440
- step=1,
1441
- value=DEFAULT_FRAMES_AFTER_EOS,
1442
- info="Additional frames after end-of-sequence (80ms per frame)"
1443
  )
1444
 
1445
- # -------------------------------------------------------------------------
1446
- # TEXT INPUT SECTION
1447
- # -------------------------------------------------------------------------
1448
-
1449
- text_input_component = gr.Textbox(
1450
- label="Prompt",
1451
- placeholder="Enter the text you want to convert to speech...",
1452
- lines=3,
1453
- max_lines=20,
1454
- max_length=MAXIMUM_INPUT_LENGTH,
1455
- autoscroll=True
1456
- )
1457
-
1458
- # Character count display
1459
- character_count_display = gr.HTML(
1460
- f"<div style='text-align: right; padding: 4px 0;'><span style='color: var(--body-text-color-subdued); font-size: 0.85em;'>0 / {MAXIMUM_INPUT_LENGTH}</span></div>"
1461
- )
1462
-
1463
- # -------------------------------------------------------------------------
1464
- # ACTION BUTTONS SECTION
1465
- # -------------------------------------------------------------------------
1466
-
1467
- # Primary generate button
1468
- generate_button = gr.Button(
1469
- "🎙️ Generate Speech",
1470
- variant="primary",
1471
- size="lg",
1472
- interactive=False
1473
- )
1474
-
1475
- # Stop button (visible during generation)
1476
- stop_button = gr.Button(
1477
- "⏹️ Stop Generation",
1478
- variant="stop",
1479
- size="lg",
1480
- visible=False
1481
- )
1482
-
1483
- # Clear button (visible when content exists)
1484
- clear_button = gr.Button(
1485
- "🗑️ Clear",
1486
- variant="secondary",
1487
- size="lg",
1488
- visible=False
1489
- )
1490
-
1491
- # -------------------------------------------------------------------------
1492
- # EXAMPLE PROMPTS SECTION
1493
- # -------------------------------------------------------------------------
1494
-
1495
- gr.HTML("""
1496
- <div style="padding: 16px 0 8px 0;">
1497
- <h3 style="margin: 0 0 8px 0; font-size: 1.1em;">💡 Example Prompts</h3>
1498
- <p style="margin: 0; opacity: 0.7; font-size: 0.9em;">Click any example to generate speech with its assigned voice</p>
1499
- </div>
1500
- """)
1501
-
1502
- # Create example buttons dynamically
1503
- example_buttons_list = []
1504
-
1505
- with gr.Row():
1506
- example_button_0 = gr.Button(
1507
- format_example_button_label(
1508
- EXAMPLE_PROMPTS_WITH_VOICES[0]["text"],
1509
- EXAMPLE_PROMPTS_WITH_VOICES[0]["voice"]
1510
- ),
1511
- size="sm",
1512
- variant="secondary"
1513
- )
1514
- example_buttons_list.append(example_button_0)
1515
-
1516
- example_button_1 = gr.Button(
1517
- format_example_button_label(
1518
- EXAMPLE_PROMPTS_WITH_VOICES[1]["text"],
1519
- EXAMPLE_PROMPTS_WITH_VOICES[1]["voice"]
1520
- ),
1521
- size="sm",
1522
- variant="secondary"
1523
- )
1524
- example_buttons_list.append(example_button_1)
1525
-
1526
- with gr.Row():
1527
- example_button_2 = gr.Button(
1528
- format_example_button_label(
1529
- EXAMPLE_PROMPTS_WITH_VOICES[2]["text"],
1530
- EXAMPLE_PROMPTS_WITH_VOICES[2]["voice"]
1531
- ),
1532
- size="sm",
1533
- variant="secondary"
1534
- )
1535
- example_buttons_list.append(example_button_2)
1536
-
1537
- example_button_3 = gr.Button(
1538
- format_example_button_label(
1539
- EXAMPLE_PROMPTS_WITH_VOICES[3]["text"],
1540
- EXAMPLE_PROMPTS_WITH_VOICES[3]["voice"]
1541
- ),
1542
- size="sm",
1543
- variant="secondary"
1544
- )
1545
- example_buttons_list.append(example_button_3)
1546
-
1547
- with gr.Row():
1548
- example_button_4 = gr.Button(
1549
- format_example_button_label(
1550
- EXAMPLE_PROMPTS_WITH_VOICES[4]["text"],
1551
- EXAMPLE_PROMPTS_WITH_VOICES[4]["voice"]
1552
- ),
1553
- size="sm",
1554
- variant="secondary"
1555
- )
1556
- example_buttons_list.append(example_button_4)
1557
 
1558
- # -------------------------------------------------------------------------
1559
- # EVENT HANDLERS AND BINDINGS
1560
- # -------------------------------------------------------------------------
1561
-
1562
- # Define input components list for generation function
1563
  generation_inputs = [
1564
  text_input_component,
1565
  voice_mode_radio,
@@ -1574,14 +236,15 @@ with gr.Blocks() as application:
1574
  enable_custom_frames_checkbox
1575
  ]
1576
 
1577
- # Voice mode change handler
1578
  voice_mode_radio.change(
1579
  fn=update_voice_mode_visibility,
1580
  inputs=[voice_mode_radio],
1581
- outputs=[preset_voice_container, clone_voice_container]
 
 
 
1582
  )
1583
 
1584
- # Text input change handlers
1585
  text_input_component.change(
1586
  fn=calculate_character_count_display,
1587
  inputs=[text_input_component],
@@ -1590,49 +253,54 @@ with gr.Blocks() as application:
1590
 
1591
  text_input_component.change(
1592
  fn=check_generate_button_state,
1593
- inputs=[text_input_component, ui_state],
 
 
 
1594
  outputs=[generate_button]
1595
  )
1596
 
1597
  text_input_component.change(
1598
  fn=determine_clear_button_visibility,
1599
- inputs=[text_input_component, audio_output_component, ui_state],
1600
- outputs=[clear_button]
1601
- )
1602
-
1603
- # Audio output change handler
1604
- audio_output_component.change(
1605
- fn=determine_clear_button_visibility,
1606
- inputs=[text_input_component, audio_output_component, ui_state],
1607
  outputs=[clear_button]
1608
  )
1609
 
1610
- # Generate button click handler chain
1611
  generate_button.click(
1612
  fn=switch_to_generating_state,
1613
  inputs=[ui_state],
1614
- outputs=[generate_button, stop_button, clear_button, ui_state]
 
 
 
 
 
1615
  ).then(
1616
  fn=perform_speech_generation,
1617
  inputs=generation_inputs,
1618
  outputs=[audio_output_component]
1619
  ).then(
1620
  fn=switch_to_idle_state,
1621
- inputs=[text_input_component, audio_output_component, ui_state],
1622
- outputs=[generate_button, stop_button, clear_button, ui_state]
1623
- ).then(
1624
- fn=check_generate_button_state,
1625
- inputs=[text_input_component, ui_state],
1626
- outputs=[generate_button]
 
 
 
 
1627
  )
1628
 
1629
- # Stop button handler
1630
  stop_button.click(
1631
  fn=request_generation_stop,
1632
  outputs=[stop_button]
1633
  )
1634
 
1635
- # Clear button handler
1636
  clear_button.click(
1637
  fn=perform_clear_action,
1638
  outputs=[
@@ -1645,39 +313,42 @@ with gr.Blocks() as application:
1645
  ]
1646
  )
1647
 
1648
- # Example button handlers
1649
  for button_index, example_button in enumerate(example_buttons_list):
1650
- example_text = EXAMPLE_PROMPTS_WITH_VOICES[button_index]["text"]
1651
- example_voice = EXAMPLE_PROMPTS_WITH_VOICES[button_index]["voice"]
1652
 
1653
  example_button.click(
1654
- fn=create_example_handler(example_text, example_voice),
1655
- outputs=[text_input_component, voice_mode_radio, voice_preset_dropdown]
1656
- ).then(
1657
  fn=switch_to_generating_state,
1658
  inputs=[ui_state],
1659
- outputs=[generate_button, stop_button, clear_button, ui_state]
 
 
 
 
 
 
 
 
 
 
 
 
1660
  ).then(
1661
  fn=perform_speech_generation,
1662
  inputs=generation_inputs,
1663
  outputs=[audio_output_component]
1664
  ).then(
1665
  fn=switch_to_idle_state,
1666
- inputs=[text_input_component, audio_output_component, ui_state],
1667
- outputs=[generate_button, stop_button, clear_button, ui_state]
1668
- ).then(
1669
- fn=check_generate_button_state,
1670
- inputs=[text_input_component, ui_state],
1671
- outputs=[generate_button]
 
 
 
 
1672
  )
1673
 
1674
-
1675
- # =============================================================================
1676
- # APPLICATION ENTRY POINT
1677
- # =============================================================================
1678
-
1679
- if __name__ == "__main__":
1680
- application.launch(
1681
- server_name="0.0.0.0",
1682
- share=False
1683
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  #
2
  # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
  # SPDX-License-Identifier: Apache-2.0
4
  #
5
 
6
+ import math
7
+ import torch
8
+ import gradio as gr
9
+ torch.set_num_threads(1)
10
+ torch.set_num_interop_threads(1)
11
+ from config import (
12
+ AVAILABLE_VOICES,
13
+ DEFAULT_VOICE,
14
+ DEFAULT_MODEL_VARIANT,
15
+ DEFAULT_TEMPERATURE,
16
+ DEFAULT_LSD_DECODE_STEPS,
17
+ DEFAULT_EOS_THRESHOLD,
18
+ DEFAULT_NOISE_CLAMP,
19
+ DEFAULT_FRAMES_AFTER_EOS,
20
+ MAXIMUM_INPUT_LENGTH,
21
+ VOICE_MODE_PRESET,
22
+ VOICE_MODE_CLONE,
23
+ EXAMPLE_PROMPTS
24
+ )
25
+ from src.core.authentication import authenticate_huggingface
26
+ authenticate_huggingface()
27
+ from src.core.memory import start_background_cleanup_thread
28
+ start_background_cleanup_thread()
29
+ from src.generation.handler import (
30
+ perform_speech_generation,
31
+ request_generation_stop
32
+ )
33
+ from src.ui.state import (
34
+ check_generate_button_state,
35
+ calculate_character_count_display,
36
+ determine_clear_button_visibility,
37
+ update_voice_mode_visibility
38
+ )
39
+ from src.ui.handlers import (
40
+ switch_to_generating_state,
41
+ switch_to_idle_state,
42
+ perform_clear_action,
43
+ create_example_handler,
44
+ format_example_button_label
45
+ )
46
+ from assets.css.styles import CSS
47
+ from assets.static.title import TITLE
48
+ from assets.static.header import HEADER
49
+ from assets.static.footer import FOOTER
50
+ from assets.static.sidebar import SIDEBAR
51
+
52
+ with gr.Blocks(css=CSS, fill_height=False, fill_width=True) as app:
53
+ ui_state = gr.State({"generating": False})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ with gr.Sidebar():
56
+ gr.HTML(SIDEBAR())
 
57
 
58
+ with gr.Column(elem_classes="header-section"):
59
+ gr.HTML(TITLE())
60
+ gr.HTML(HEADER())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
+ with gr.Row():
63
+ with gr.Column():
64
+ audio_output_component = gr.Audio(
65
+ label="Generated Speech Output",
66
+ type="filepath",
67
+ interactive=False,
68
+ autoplay=False
69
+ )
 
 
 
70
 
71
+ with gr.Accordion("Voice Selection", open=True):
72
+ voice_mode_radio = gr.Radio(
73
+ label="Voice Mode",
74
+ choices=[
75
+ VOICE_MODE_PRESET,
76
+ VOICE_MODE_CLONE
77
+ ],
78
+ value=VOICE_MODE_PRESET,
79
+ info="Choose between preset voices or clone a voice from uploaded audio",
80
+ elem_id="voice-mode"
81
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ with gr.Column(visible=True) as preset_voice_container:
84
+ voice_preset_dropdown = gr.Dropdown(
85
+ label="Select Preset Voice",
86
+ choices=AVAILABLE_VOICES,
87
+ value=DEFAULT_VOICE
88
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ with gr.Column(visible=False) as clone_voice_container:
91
+ voice_clone_audio_input = gr.Audio(
92
+ label="Upload Audio for Voice Cloning",
93
+ type="filepath"
94
+ )
95
 
96
+ with gr.Accordion("Model Parameters", open=False):
97
+ with gr.Row():
98
+ temperature_slider = gr.Slider(
99
+ label="Temperature",
100
+ minimum=0.1,
101
+ maximum=2.0,
102
+ step=0.05,
103
+ value=DEFAULT_TEMPERATURE,
104
+ info="Higher values produce more expressive speech"
105
+ )
106
+
107
+ lsd_decode_steps_slider = gr.Slider(
108
+ label="LSD Decode Steps",
109
+ minimum=1,
110
+ maximum=20,
111
+ step=1,
112
+ value=DEFAULT_LSD_DECODE_STEPS,
113
+ info="More steps may improve quality but slower"
114
+ )
115
 
116
+ with gr.Row():
117
+ noise_clamp_slider = gr.Slider(
118
+ label="Noise Clamp",
119
+ minimum=0.0,
120
+ maximum=2.0,
121
+ step=0.05,
122
+ value=DEFAULT_NOISE_CLAMP,
123
+ info="Maximum noise sampling value (0 = disabled)"
124
+ )
125
+
126
+ eos_threshold_slider = gr.Slider(
127
+ label="End of Sequence Threshold",
128
+ minimum=-10.0,
129
+ maximum=0.0,
130
+ step=0.25,
131
+ value=DEFAULT_EOS_THRESHOLD,
132
+ info="Smaller values cause earlier completion"
133
+ )
134
 
135
+ with gr.Accordion("Advanced Settings", open=False):
136
+ model_variant_textbox = gr.Textbox(
137
+ label="Model Variant Identifier",
138
+ value=DEFAULT_MODEL_VARIANT,
139
+ info="Model signature for generation"
140
+ )
141
 
142
+ with gr.Row():
143
+ enable_custom_frames_checkbox = gr.Checkbox(
144
+ label="Enable Custom Frames After EOS",
145
+ value=False,
146
+ info="Manually control post-EOS frame generation"
147
  )
148
 
149
+ frames_after_eos_slider = gr.Slider(
150
+ label="Frames After EOS",
151
+ minimum=0,
152
+ maximum=100,
153
+ step=1,
154
+ value=DEFAULT_FRAMES_AFTER_EOS,
155
+ info="Additional frames after end-of-sequence (80ms per frame)"
156
+ )
 
 
 
 
157
 
158
+ with gr.Column(scale=1):
159
+ text_input_component = gr.Textbox(
160
+ label="Prompt",
161
+ placeholder="Enter the text you want to convert to speech...",
162
+ lines=2,
163
+ max_lines=20,
164
+ max_length=MAXIMUM_INPUT_LENGTH,
165
+ autoscroll=True
166
  )
167
 
168
+ character_count_display = gr.HTML(
169
+ f"<div style='text-align: right; padding: 4px 0;'><span style='color: var(--body-text-color-subdued); font-size: 0.85em;'>0 / {MAXIMUM_INPUT_LENGTH}</span></div>",
170
+ visible=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
+ generate_button = gr.Button(
174
+ "Generate",
175
+ variant="primary",
176
+ size="lg",
177
+ interactive=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
+ stop_button = gr.Button(
181
+ "Stop",
182
+ variant="stop",
183
+ size="lg",
184
+ visible=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  )
186
 
187
+ clear_button = gr.Button(
188
+ "Clear",
189
+ variant="secondary",
190
+ size="lg",
191
+ visible=False
 
 
 
 
 
 
 
 
 
 
 
192
  )
193
 
194
+ gr.HTML(
195
+ """
196
+ <div style="padding: 16px 0 8px 0;">
197
+ <h3 style="margin: 0 0 8px 0; font-size: 1.1em;">Example Prompts</h3>
198
+ <p style="margin: 0; opacity: 0.7; font-size: 0.9em;">Click any example to generate speech with its assigned voice</p>
199
+ </div>
200
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  )
202
 
203
+ example_buttons_list = []
204
+ num_examples = len(EXAMPLE_PROMPTS)
205
+ examples_per_row = 2
206
+ num_rows = math.ceil(num_examples / examples_per_row)
207
+
208
+ for row_idx in range(num_rows):
209
+ with gr.Row():
210
+ start_idx = row_idx * examples_per_row
211
+ end_idx = min(start_idx + examples_per_row, num_examples)
212
+ for i in range(start_idx, end_idx):
213
+ btn = gr.Button(
214
+ format_example_button_label(
215
+ EXAMPLE_PROMPTS[i]["text"],
216
+ EXAMPLE_PROMPTS[i]["voice"]
217
+ ),
218
+ size="sm",
219
+ variant="secondary"
220
+ )
221
+ example_buttons_list.append(btn)
222
+
223
+ gr.HTML(FOOTER())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
 
 
 
 
 
225
  generation_inputs = [
226
  text_input_component,
227
  voice_mode_radio,
 
236
  enable_custom_frames_checkbox
237
  ]
238
 
 
239
  voice_mode_radio.change(
240
  fn=update_voice_mode_visibility,
241
  inputs=[voice_mode_radio],
242
+ outputs=[
243
+ preset_voice_container,
244
+ clone_voice_container
245
+ ]
246
  )
247
 
 
248
  text_input_component.change(
249
  fn=calculate_character_count_display,
250
  inputs=[text_input_component],
 
253
 
254
  text_input_component.change(
255
  fn=check_generate_button_state,
256
+ inputs=[
257
+ text_input_component,
258
+ ui_state
259
+ ],
260
  outputs=[generate_button]
261
  )
262
 
263
  text_input_component.change(
264
  fn=determine_clear_button_visibility,
265
+ inputs=[
266
+ text_input_component,
267
+ ui_state
268
+ ],
 
 
 
 
269
  outputs=[clear_button]
270
  )
271
 
 
272
  generate_button.click(
273
  fn=switch_to_generating_state,
274
  inputs=[ui_state],
275
+ outputs=[
276
+ generate_button,
277
+ stop_button,
278
+ clear_button,
279
+ ui_state
280
+ ]
281
  ).then(
282
  fn=perform_speech_generation,
283
  inputs=generation_inputs,
284
  outputs=[audio_output_component]
285
  ).then(
286
  fn=switch_to_idle_state,
287
+ inputs=[
288
+ text_input_component,
289
+ ui_state
290
+ ],
291
+ outputs=[
292
+ generate_button,
293
+ stop_button,
294
+ clear_button,
295
+ ui_state
296
+ ]
297
  )
298
 
 
299
  stop_button.click(
300
  fn=request_generation_stop,
301
  outputs=[stop_button]
302
  )
303
 
 
304
  clear_button.click(
305
  fn=perform_clear_action,
306
  outputs=[
 
313
  ]
314
  )
315
 
 
316
  for button_index, example_button in enumerate(example_buttons_list):
317
+ example_text = EXAMPLE_PROMPTS[button_index]["text"]
318
+ example_voice = EXAMPLE_PROMPTS[button_index]["voice"]
319
 
320
  example_button.click(
 
 
 
321
  fn=switch_to_generating_state,
322
  inputs=[ui_state],
323
+ outputs=[
324
+ generate_button,
325
+ stop_button,
326
+ clear_button,
327
+ ui_state
328
+ ]
329
+ ).then(
330
+ fn=create_example_handler(example_text, example_voice),
331
+ outputs=[
332
+ text_input_component,
333
+ voice_mode_radio,
334
+ voice_preset_dropdown
335
+ ]
336
  ).then(
337
  fn=perform_speech_generation,
338
  inputs=generation_inputs,
339
  outputs=[audio_output_component]
340
  ).then(
341
  fn=switch_to_idle_state,
342
+ inputs=[
343
+ text_input_component,
344
+ ui_state
345
+ ],
346
+ outputs=[
347
+ generate_button,
348
+ stop_button,
349
+ clear_button,
350
+ ui_state
351
+ ]
352
  )
353
 
354
+ app.launch(server_name="0.0.0.0")
 
 
 
 
 
 
 
 
 
assets/css/styles.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # https://huggingface.co/spaces/D3vShoaib/pocket-tts
3
+ #
4
+
5
+ CSS = """
6
+ footer {
7
+ visibility: hidden
8
+ }
9
+
10
+ .gradio-container {
11
+ max-width: 100% !important;
12
+ padding: 0 !important;
13
+ }
14
+
15
+ @media (min-width: 768px) {
16
+ .gradio-container {
17
+ padding-left: 2% !important;
18
+ padding-right: 2% !important;
19
+ }
20
+ }
21
+
22
+ .header-section {
23
+ text-align: left;
24
+ margin-bottom: 1.5rem;
25
+ }
26
+
27
+ .main-title {
28
+ color: #10b981;
29
+ font-weight: 800;
30
+ font-size: 1.8rem;
31
+ margin: 5px 0;
32
+ }
33
+
34
+ @media (min-width: 768px) {
35
+ .main-title {
36
+ font-size: 2.2rem;
37
+ }
38
+ }
39
+
40
+ .logo-container {
41
+ display: flex;
42
+ justify-content: flex-start;
43
+ align-items: center;
44
+ gap: 10px;
45
+ margin-bottom: 0;
46
+ }
47
+
48
+ .logo-img {
49
+ height: 40px;
50
+ border-radius: 8px;
51
+ }
52
+
53
+ @media (min-width: 768px) {
54
+ .logo-img {
55
+ height: 50px;
56
+ }
57
+
58
+ .logo-container {
59
+ gap: 15px;
60
+ }
61
+ }
62
+
63
+ .links-row {
64
+ display: flex;
65
+ flex-wrap: wrap;
66
+ justify-content: flex-start;
67
+ gap: 8px;
68
+ margin: 5px 0 10px 0;
69
+ font-size: 0.85rem;
70
+ }
71
+
72
+ @media (min-width: 768px) {
73
+ .links-row {
74
+ gap: 10px;
75
+ font-size: 0.9rem;
76
+ }
77
+ }
78
+
79
+ .links-row a {
80
+ color: #10b981;
81
+ text-decoration: none;
82
+ padding: 3px 10px;
83
+ border: 1px solid #10b981;
84
+ border-radius: 15px;
85
+ transition: all 0.2s;
86
+ white-space: nowrap;
87
+ }
88
+
89
+ .links-row a:hover {
90
+ background-color: #10b981;
91
+ color: white;
92
+ }
93
+
94
+ .disclaimer {
95
+ text-align: center;
96
+ font-size: 0.8rem;
97
+ color: #9ca3af;
98
+ margin-top: 30px;
99
+ padding: 15px;
100
+ border-top: 1px solid currentColor;
101
+ }
102
+
103
+ @media (min-width: 768px) {
104
+ .disclaimer {
105
+ margin-top: 40px;
106
+ padding: 20px;
107
+ }
108
+ }
109
+
110
+ #voice-mode .wrap {
111
+ display: flex !important;
112
+ flex-direction: row !important;
113
+ width: 100% !important;
114
+ }
115
+
116
+ #voice-mode .wrap label {
117
+ flex: 1 !important;
118
+ justify-content: center !important;
119
+ text-align: center !important;
120
+ }
121
+ """
assets/static/footer.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ from config import (
7
+ COPYRIGHT_NAME,
8
+ COPYRIGHT_URL,
9
+ DESIGN_BY_NAME,
10
+ DESIGN_BY_URL
11
+ )
12
+
13
def FOOTER():
    """Render the page footer: copyright, design credit, and a disclaimer.

    Returns:
        str: an HTML fragment interpolating names/URLs from ``config``.
    """
    # Bug fix: the original anchors carried `target="_blank"` twice,
    # which is an invalid duplicated HTML attribute.
    return f"""
    <div class="disclaimer" style="font-size: 10px; line-height: 1.4;">
        <br>
        <p style="opacity: 0.8;">
            Copyright © 2026
            <a href="{COPYRIGHT_URL}" target="_blank" style="color: #10b981; text-decoration: none;">
                {COPYRIGHT_NAME}
            </a>,
            design inspired by
            <a href="{DESIGN_BY_URL}" target="_blank" style="color: #10b981; text-decoration: none;">
                {DESIGN_BY_NAME}
            </a>.
        </p>

        <p style="font-size: 8px; opacity: 0.7;">
            ⚠️ This Space is not affiliated with Kyutai TTS and is provided for demonstration purposes only.
        </p>
    </div>
    """
assets/static/header.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ from config import HEADER_LINKS
7
+
8
def HEADER():
    """Render the row of external links configured in ``HEADER_LINKS``.

    Returns:
        str: an HTML fragment containing one anchor per configured link.
    """
    # Each anchor keeps its trailing newline, matching the accumulated form.
    anchors = "".join(
        f'<a href="{link["url"]}" target="_blank">{link["icon"]} {link["text"]}</a>\n'
        for link in HEADER_LINKS
    )

    return f"""
    <div class="links-row">
        {anchors}
    </div>
    """
assets/static/sidebar.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
def SIDEBAR():
    """Render the static sidebar describing this Space and its credits.

    Returns:
        str: a constant HTML fragment (no interpolation is performed).
    """
    # Fixes: the original used an f-string with no placeholders (Ruff F541)
    # and repeated `target="_blank"` on every anchor (invalid duplicate
    # HTML attribute); each anchor now carries the attribute exactly once.
    return """
    <h1>
        Audio Generation Playground part of the
        <a href="https://huggingface.co/spaces/hadadxyz/ai"
           target="_blank" style="color: #10b981; text-decoration: none;">
            Demo Playground
        </a>,
        and the
        <a href="https://huggingface.co/umint"
           target="_blank" style="color: #10b981; text-decoration: none;">
            UltimaX Intelligence
        </a>
        project.
    </h1><br />

    <p>
        This Space runs the
        <b>
            <a href="https://huggingface.co/kyutai/pocket-tts"
               target="_blank" style="color: #10b981; text-decoration: none;">
                Pocket TTS
            </a>
        </b>
        model from <b>Kyutai</b>.<br /><br />

        A lightweight text-to-speech (TTS) application designed to run
        efficiently on CPUs. Forget about the hassle of using GPUs and
        web APIs serving TTS models.<br /><br />

        Additionally, this Space uses a custom Docker image to
        maximize model performance and is optimized for the
        constraints of Hugging Face Spaces.
    </p><br />

    <p>
        <b>Like this project?</b> You can support me by buying a
        <a href="https://ko-fi.com/hadad"
           target="_blank" style="color: #10b981; text-decoration: none;">
            coffee
        </a>.
    </p>
    """
assets/static/title.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ from config import KYUTAI_LOGO_URL, POCKET_TTS_LOGO_URL, SPACE_INFO
7
+
8
def TITLE():
    """Render the banner: both project logos plus the Space title.

    Interpolates logo URLs and the title text from ``config``.
    """
    return f"""
    <div class="logo-container">
        <img src="{KYUTAI_LOGO_URL}" class="logo-img" alt="Kyutai Logo">
        <img src="{POCKET_TTS_LOGO_URL}" class="logo-img" alt="PocketTTS Logo">
        <h1 class='main-title'>{SPACE_INFO}</h1>
    </div>
    """
config.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#

import os

# Hugging Face access token read from the environment; required for gated
# features (voice cloning). None when not configured.
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Preset speaker identities shipped with the Pocket TTS model.
AVAILABLE_VOICES = [
    "alba",
    "marius",
    "javert",
    "jean",
    "fantine",
    "cosette",
    "eponine",
    "azelma"
]

# Defaults for the generation controls exposed in the UI.
DEFAULT_VOICE = "alba"
DEFAULT_MODEL_VARIANT = "b6369a24"
DEFAULT_TEMPERATURE = 0.7
DEFAULT_LSD_DECODE_STEPS = 1
DEFAULT_EOS_THRESHOLD = -4.0
DEFAULT_NOISE_CLAMP = 0.0
DEFAULT_FRAMES_AFTER_EOS = 10

# Labels for the two voice-selection radio options.
VOICE_MODE_PRESET = "Preset Voices"
VOICE_MODE_CLONE = "Voice Cloning"

# LRU cache sizing for cached voice-state tensors.
VOICE_STATE_CACHE_MAXIMUM_SIZE = 8
VOICE_STATE_CACHE_CLEANUP_THRESHOLD = 4

# Seconds between idle passes of the background cleanup thread.
BACKGROUND_CLEANUP_INTERVAL = 300

# Maximum number of characters accepted in the prompt textbox.
MAXIMUM_INPUT_LENGTH = 1000

# Generated/converted WAV files are deleted after this many seconds.
TEMPORARY_FILE_LIFETIME_SECONDS = 7200

# Process memory budget (bytes) and derived thresholds used by src.core.memory.
MAXIMUM_MEMORY_USAGE = 1 * 1024 * 1024 * 1024
MEMORY_WARNING_THRESHOLD = int(0.7 * MAXIMUM_MEMORY_USAGE)
MEMORY_CRITICAL_THRESHOLD = int(0.85 * MAXIMUM_MEMORY_USAGE)
MEMORY_CHECK_INTERVAL = 30
MEMORY_IDLE_TARGET = int(0.5 * MAXIMUM_MEMORY_USAGE)

# Clickable example prompts shown in the UI, each bound to a preset voice.
EXAMPLE_PROMPTS = [
    {
        "text": "The quick brown fox jumps over the lazy dog near the riverbank.",
        "voice": "alba"
    },
    {
        "text": "Welcome to the future of text to speech technology powered by artificial intelligence.",
        "voice": "marius"
    },
    {
        "text": "Technology continues to push the boundaries of what we thought was possible.",
        "voice": "javert"
    },
    {
        "text": "The weather today is absolutely beautiful and perfect for a relaxing walk outside.",
        "voice": "fantine"
    },
    {
        "text": "Science and innovation are transforming how we interact with the world around us.",
        "voice": "jean"
    }
]

# Branding assets and metadata used by the static HTML fragments.
KYUTAI_LOGO_URL = "https://cdn-avatars.huggingface.co/v1/production/uploads/6355a3c1805be5a8f30fea49/8xGdIOlfkopZfhbMitw_k.jpeg"
POCKET_TTS_LOGO_URL = "https://raw.githubusercontent.com/kyutai-labs/pocket-tts/refs/heads/main/docs/logo.png"

SPACE_INFO = "Pocket TTS"

# External links rendered in the header row.
HEADER_LINKS = [
    {"icon": "🔊", "text": "Demo", "url": "https://kyutai.org/tts"},
    {"icon": "🐱‍💻", "text": "GitHub", "url": "https://github.com/kyutai-labs/pocket-tts"},
    {"icon": "🤗", "text": "Model Card", "url": "https://huggingface.co/kyutai/pocket-tts"},
    {"icon": "🤗", "text": "Space", "url": "https://huggingface.co/spaces/hadadxyz/pocket-tts-hf-cpu-optimized"},
    {"icon": "📄", "text": "Paper", "url": "https://arxiv.org/abs/2509.06926"},
    {"icon": "📚", "text": "Docs", "url": "https://github.com/kyutai-labs/pocket-tts/tree/main/docs"},
]

# Footer attribution.
COPYRIGHT_NAME = "Hadad Darajat"
COPYRIGHT_URL = "https://www.linkedin.com/in/hadadrjt"

DESIGN_BY_NAME = "D3vShoaib/pocket-tts"
DESIGN_BY_URL = f"https://huggingface.co/spaces/{DESIGN_BY_NAME}"
src/audio/converter.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ import time
7
+ import tempfile
8
+ import numpy as np
9
+ import scipy.io.wavfile
10
+ from ..core.state import temporary_files_registry, temporary_files_lock
11
+ from ..core.memory import trigger_background_cleanup_check
12
+
13
def convert_audio_to_pcm_wav(input_path):
    """Convert a WAV file of any sample format to 16-bit PCM.

    The converted copy is written to a new temporary file which is
    registered for delayed cleanup. On any failure the original path is
    returned unchanged (best-effort conversion).

    Args:
        input_path: path to the source WAV file.

    Returns:
        Path to the converted 16-bit PCM WAV, or ``input_path`` on error.
    """
    try:
        sample_rate, audio_data = scipy.io.wavfile.read(input_path)

        if audio_data.dtype in (np.float32, np.float64):
            # Float WAV data is nominally in [-1, 1]; clip then scale to int16.
            audio_data = np.clip(audio_data, -1.0, 1.0)
            audio_data = (audio_data * 32767).astype(np.int16)

        elif audio_data.dtype == np.int32:
            # Keep the most significant 16 bits of 32-bit samples.
            audio_data = (audio_data >> 16).astype(np.int16)

        elif audio_data.dtype == np.uint8:
            # 8-bit WAV is unsigned; recenter around zero and widen.
            audio_data = ((audio_data.astype(np.int16) - 128) * 256).astype(np.int16)

        elif audio_data.dtype != np.int16:
            audio_data = audio_data.astype(np.int16)

        # Bug fix: the original never closed the NamedTemporaryFile handle,
        # leaking one file descriptor per conversion. We only need the
        # unique path; scipy writes to the path, not the open handle.
        with tempfile.NamedTemporaryFile(suffix="_converted.wav", delete=False) as output_file:
            output_path = output_file.name

        scipy.io.wavfile.write(output_path, sample_rate, audio_data)

        # Register for delayed deletion by the background cleanup thread.
        with temporary_files_lock:
            temporary_files_registry[output_path] = time.time()

        trigger_background_cleanup_check()

        return output_path

    except Exception as conversion_error:
        print(f"Warning: {conversion_error}")
        return input_path
src/core/authentication.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ from config import HF_TOKEN
7
+ from huggingface_hub import login
8
+
9
def authenticate_huggingface():
    """Log in to Hugging Face when a token is configured; print status."""
    if not HF_TOKEN:
        # No token: voice cloning (gated model access) cannot be enabled.
        print("Missing Hugging Face authentication required for the license agreement")
        return

    try:
        login(token=HF_TOKEN, add_to_git_credential=False)
        print("Authenticated with Hugging Face")
    except Exception as authentication_error:
        print(f"Hugging Face authentication failed: {authentication_error}")
        print("Voice cloning may not be available")

def get_huggingface_token():
    """Return the configured Hugging Face token, or None when absent."""
    return HF_TOKEN
src/core/memory.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ import os
7
+ import gc
8
+ import time
9
+ import atexit
10
+ import threading
11
+ import torch
12
+ from config import (
13
+ TEMPORARY_FILE_LIFETIME_SECONDS,
14
+ BACKGROUND_CLEANUP_INTERVAL,
15
+ MEMORY_WARNING_THRESHOLD,
16
+ MEMORY_CRITICAL_THRESHOLD,
17
+ MEMORY_CHECK_INTERVAL,
18
+ MEMORY_IDLE_TARGET,
19
+ MAXIMUM_MEMORY_USAGE
20
+ )
21
+ from ..core.state import (
22
+ temporary_files_registry,
23
+ temporary_files_lock,
24
+ memory_enforcement_lock,
25
+ background_cleanup_thread,
26
+ background_cleanup_stop_event,
27
+ background_cleanup_trigger_event,
28
+ check_if_generation_is_currently_active,
29
+ get_text_to_speech_manager
30
+ )
31
+
32
def get_current_memory_usage():
    """Best-effort resident memory of this process in bytes (0 if unknown)."""
    # Preferred source: VmRSS from /proc/self/status (Linux only).
    try:
        with open('/proc/self/status', 'r') as status_file:
            for status_line in status_file:
                if status_line.startswith('VmRSS:'):
                    return int(status_line.split()[1]) * 1024
    except Exception:
        pass

    # Fallback: resident page count from /proc/self/statm.
    try:
        with open('/proc/self/statm', 'r') as statm_file:
            fields = statm_file.read().split()
            return int(fields[1]) * os.sysconf('SC_PAGE_SIZE')
    except Exception:
        pass

    # Last resort: peak (not current) RSS via getrusage.
    # ru_maxrss is reported in bytes on macOS and kilobytes elsewhere.
    try:
        import resource
        import platform
        peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

        if platform.system() == "Darwin":
            return peak_rss
        return peak_rss * 1024
    except Exception:
        pass

    return 0
67
+
68
def is_memory_usage_within_limit():
    """True while resident memory is below the hard cap."""
    return get_current_memory_usage() < MAXIMUM_MEMORY_USAGE

def is_memory_usage_approaching_limit():
    """True once resident memory has reached the warning threshold."""
    return get_current_memory_usage() >= MEMORY_WARNING_THRESHOLD

def is_memory_usage_critical():
    """True once resident memory has reached the critical threshold."""
    return get_current_memory_usage() >= MEMORY_CRITICAL_THRESHOLD

def is_memory_above_idle_target():
    """True while resident memory exceeds the idle-time target."""
    return get_current_memory_usage() > MEMORY_IDLE_TARGET
83
+
84
def force_garbage_collection():
    """Run a full generational GC pass and flush any CUDA caches."""
    for gc_generation in (0, 1, 2):
        gc.collect(gc_generation)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

def memory_cleanup():
    """Aggressive cleanup: GC, ask glibc to return freed arenas, GC again."""
    force_garbage_collection()

    try:
        # malloc_trim is glibc-specific; silently skip on other libcs/OSes.
        import ctypes
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except Exception:
        pass

    force_garbage_collection()
105
+
106
def perform_memory_cleanup():
    """Routine cleanup between generations: GC, evict LRU voice states, trim."""
    force_garbage_collection()

    manager = get_text_to_speech_manager()
    if manager is not None:
        manager.evict_least_recently_used_voice_states()

    memory_cleanup()
114
+
115
def cleanup_expired_temporary_files():
    """Delete and deregister temporary files older than their lifetime."""
    now = time.time()

    with temporary_files_lock:
        expired_paths = [
            path
            for path, created_at in list(temporary_files_registry.items())
            if now - created_at > TEMPORARY_FILE_LIFETIME_SECONDS
        ]

        for path in expired_paths:
            try:
                if os.path.exists(path):
                    os.remove(path)
                del temporary_files_registry[path]
            except Exception:
                # Best-effort: a failed delete keeps the registry entry
                # so the next cycle can retry.
                pass
132
+
133
def cleanup_all_temporary_files_immediately():
    """Delete every registered temporary file regardless of age."""
    with temporary_files_lock:
        for path in list(temporary_files_registry.keys()):
            try:
                if os.path.exists(path):
                    os.remove(path)
                del temporary_files_registry[path]
            except Exception:
                pass
143
+
144
def has_temporary_files_pending_cleanup():
    """True if at least one tracked temporary file has outlived its lifetime."""
    with temporary_files_lock:
        if not temporary_files_registry:
            return False

        now = time.time()
        return any(
            now - created_at > TEMPORARY_FILE_LIFETIME_SECONDS
            for created_at in temporary_files_registry.values()
        )
156
+
157
def has_any_temporary_files_registered():
    """True if any temporary files are currently being tracked."""
    with temporary_files_lock:
        return bool(temporary_files_registry)
160
+
161
def calculate_time_until_next_file_expiration():
    """Seconds until the next tracked temporary file expires.

    Returns None when nothing is tracked, and 0 as soon as any file is
    already past its lifetime.
    """
    with temporary_files_lock:
        if not temporary_files_registry:
            return None

        now = time.time()
        soonest = None

        for created_at in temporary_files_registry.values():
            remaining = TEMPORARY_FILE_LIFETIME_SECONDS - (now - created_at)

            if remaining <= 0:
                # Already expired: cleanup should run immediately.
                return 0

            if soonest is None or remaining < soonest:
                soonest = remaining

        return soonest
180
+
181
def enforce_memory_limit_if_exceeded():
    """Escalating cleanup until memory is back under budget.

    The ladder is: GC -> evict LRU voice states -> drop the whole voice
    cache + temp files -> (only when idle) unload the model entirely.
    After each rung the usage is re-measured and the function returns
    early once the relevant threshold is satisfied.

    Returns:
        bool: True when usage ended below the applicable limit.
    """
    with memory_enforcement_lock:
        # Snapshot once: an active generation forbids unloading the model.
        generation_is_active = check_if_generation_is_currently_active()

        current_memory_usage = get_current_memory_usage()

        if current_memory_usage < MEMORY_WARNING_THRESHOLD:
            return True

        # Rung 1: plain garbage collection.
        force_garbage_collection()
        current_memory_usage = get_current_memory_usage()

        if current_memory_usage < MEMORY_WARNING_THRESHOLD:
            return True

        # Rung 2: evict least-recently-used cached voice states.
        tts_manager = get_text_to_speech_manager()
        if tts_manager is not None:
            tts_manager.evict_least_recently_used_voice_states()

        memory_cleanup()
        current_memory_usage = get_current_memory_usage()

        if current_memory_usage < MEMORY_CRITICAL_THRESHOLD:
            return True

        # Rung 3: drop the entire voice-state cache and all temp files.
        if tts_manager is not None:
            tts_manager.clear_voice_state_cache_completely()

        cleanup_all_temporary_files_immediately()
        memory_cleanup()
        current_memory_usage = get_current_memory_usage()

        if current_memory_usage < MAXIMUM_MEMORY_USAGE:
            return True

        # While generating we must not unload the model; report honestly.
        if generation_is_active:
            return current_memory_usage < MAXIMUM_MEMORY_USAGE

        # Rung 4 (idle only): unload the model completely.
        if tts_manager is not None:
            tts_manager.unload_model_completely()

        memory_cleanup()
        current_memory_usage = get_current_memory_usage()

        return current_memory_usage < MAXIMUM_MEMORY_USAGE
226
+
227
def perform_idle_memory_reduction():
    """Shrink memory toward MEMORY_IDLE_TARGET while the app is idle.

    Uses the same escalation ladder as enforce_memory_limit_if_exceeded,
    but re-checks the generation flag before every destructive step and
    aborts as soon as a generation starts, so in-flight work is never
    disturbed.
    """
    if check_if_generation_is_currently_active():
        return

    with memory_enforcement_lock:
        current_memory_usage = get_current_memory_usage()

        if current_memory_usage <= MEMORY_IDLE_TARGET:
            return

        # Step 1: plain garbage collection.
        force_garbage_collection()
        current_memory_usage = get_current_memory_usage()

        if current_memory_usage <= MEMORY_IDLE_TARGET:
            return

        # A generation may have started while we were collecting.
        if check_if_generation_is_currently_active():
            return

        # Step 2: evict least-recently-used voice states.
        tts_manager = get_text_to_speech_manager()
        if tts_manager is not None:
            tts_manager.evict_least_recently_used_voice_states()

        memory_cleanup()
        current_memory_usage = get_current_memory_usage()

        if current_memory_usage <= MEMORY_IDLE_TARGET:
            return

        if check_if_generation_is_currently_active():
            return

        # Step 3: drop the entire voice-state cache.
        if tts_manager is not None:
            tts_manager.clear_voice_state_cache_completely()

        memory_cleanup()
        current_memory_usage = get_current_memory_usage()

        if current_memory_usage <= MEMORY_IDLE_TARGET:
            return

        if check_if_generation_is_currently_active():
            return

        # Step 4: unload the model entirely (it reloads on next request).
        if tts_manager is not None:
            tts_manager.unload_model_completely()

        memory_cleanup()
275
+
276
def perform_background_cleanup_cycle():
    """Worker loop for the background cleanup thread.

    Alternates between three duties until the stop event is set:
    deleting expired temporary files (waking just after the next file's
    expiry), periodic memory checks every MEMORY_CHECK_INTERVAL seconds,
    and idle memory reduction when nothing is tracked or running.
    """
    last_memory_check_timestamp = 0

    while not background_cleanup_stop_event.is_set():
        time_until_next_expiration = calculate_time_until_next_file_expiration()
        current_timestamp = time.time()
        time_since_last_memory_check = current_timestamp - last_memory_check_timestamp

        if time_until_next_expiration is not None:
            # Files are tracked: sleep until the next one expires, but never
            # longer than either periodic interval.
            if time_until_next_expiration <= 0:
                wait_duration = 1
            else:
                wait_duration = min(
                    time_until_next_expiration + 1,
                    MEMORY_CHECK_INTERVAL,
                    BACKGROUND_CLEANUP_INTERVAL
                )
        else:
            if is_memory_above_idle_target() and not check_if_generation_is_currently_active():
                # Nothing to delete but memory is high: check again soon.
                wait_duration = MEMORY_CHECK_INTERVAL
            else:
                # Fully idle: block on the trigger event so new temp-file
                # registrations wake us immediately.
                background_cleanup_trigger_event.clear()
                triggered = background_cleanup_trigger_event.wait(timeout=BACKGROUND_CLEANUP_INTERVAL)

                if background_cleanup_stop_event.is_set():
                    break

                if triggered:
                    # Woken by a registration: recompute schedule right away.
                    continue
                else:
                    # Timed out while idle: opportunistically shrink memory.
                    if not check_if_generation_is_currently_active():
                        perform_idle_memory_reduction()
                    continue

        background_cleanup_stop_event.wait(timeout=wait_duration)

        if background_cleanup_stop_event.is_set():
            break

        if has_temporary_files_pending_cleanup():
            cleanup_expired_temporary_files()

        current_timestamp = time.time()
        time_since_last_memory_check = current_timestamp - last_memory_check_timestamp

        # Periodic memory pass, skipped entirely while a generation runs.
        if time_since_last_memory_check >= MEMORY_CHECK_INTERVAL:
            if not check_if_generation_is_currently_active():
                if is_memory_usage_critical():
                    enforce_memory_limit_if_exceeded()
                elif is_memory_above_idle_target():
                    perform_idle_memory_reduction()

            last_memory_check_timestamp = current_timestamp
329
+
330
def trigger_background_cleanup_check():
    """Wake the background cleanup thread for an immediate pass."""
    background_cleanup_trigger_event.set()
332
+
333
def start_background_cleanup_thread():
    """Start the daemon cleanup worker if it is not already running.

    The thread handle is stored on the shared state module (not on this
    module) so every importer observes the same instance.
    """
    # Bug fix: the original declared `global background_cleanup_thread`
    # but never rebound that name - it mutates the attribute on the state
    # module instead. The dead declaration was misleading and is removed.
    from ..core import state as global_state

    if global_state.background_cleanup_thread is None or not global_state.background_cleanup_thread.is_alive():
        # Reset control events so a restarted worker begins in a clean state.
        background_cleanup_stop_event.clear()
        background_cleanup_trigger_event.clear()

        global_state.background_cleanup_thread = threading.Thread(
            target=perform_background_cleanup_cycle,
            daemon=True,
            name="BackgroundCleanupThread"
        )

        global_state.background_cleanup_thread.start()
349
+
350
def stop_background_cleanup_thread():
    """Signal the cleanup worker to exit and wait briefly for it."""
    from ..core import state as global_state

    # Setting both events unblocks whichever wait the worker is in.
    background_cleanup_stop_event.set()
    background_cleanup_trigger_event.set()

    worker = global_state.background_cleanup_thread
    if worker is not None and worker.is_alive():
        worker.join(timeout=5)

# Make sure the worker is stopped on interpreter shutdown.
atexit.register(stop_background_cleanup_thread)
src/core/state.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#

import threading

# Guards the two generation flags below.
generation_state_lock = threading.Lock()
is_currently_generating = False
stop_generation_requested = False

# Maps temporary file path -> creation timestamp; guarded by temporary_files_lock.
temporary_files_registry = {}
temporary_files_lock = threading.Lock()

# Serializes memory-limit enforcement so cleanup passes do not interleave.
memory_enforcement_lock = threading.Lock()

# Background cleanup worker handle and its control events.
background_cleanup_thread = None
background_cleanup_stop_event = threading.Event()
background_cleanup_trigger_event = threading.Event()

# Process-wide TextToSpeechManager singleton, registered at startup.
text_to_speech_manager = None

def set_text_to_speech_manager(manager_instance):
    """Register the process-wide TextToSpeechManager instance."""
    global text_to_speech_manager
    text_to_speech_manager = manager_instance

def get_text_to_speech_manager():
    """Return the registered TextToSpeechManager, or None if unset."""
    # Fix: the original declared `global` here, which is never needed to
    # read a module-level name and wrongly suggested a write occurs.
    return text_to_speech_manager

def check_if_generation_is_currently_active():
    """Thread-safe read of the generation-in-progress flag."""
    with generation_state_lock:
        return is_currently_generating

def set_generation_active(is_active):
    """Thread-safe write of the generation-in-progress flag."""
    global is_currently_generating
    with generation_state_lock:
        is_currently_generating = is_active

def set_stop_generation_requested(requested):
    """Thread-safe write of the stop-requested flag."""
    global stop_generation_requested
    with generation_state_lock:
        stop_generation_requested = requested

def get_stop_generation_requested():
    """Thread-safe read of the stop-requested flag."""
    with generation_state_lock:
        return stop_generation_requested
src/generation/handler.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ import gradio as gr
7
+ from config import VOICE_MODE_CLONE
8
+ from ..core.state import (
9
+ generation_state_lock,
10
+ get_stop_generation_requested,
11
+ set_stop_generation_requested
12
+ )
13
+ from ..core.authentication import get_huggingface_token
14
+ from ..core.memory import (
15
+ has_temporary_files_pending_cleanup,
16
+ cleanup_expired_temporary_files,
17
+ perform_memory_cleanup,
18
+ memory_cleanup,
19
+ trigger_background_cleanup_check
20
+ )
21
+ from ..tts.manager import text_to_speech_manager
22
+ from ..validation.text import validate_text_input
23
+
24
def check_if_generating():
    """Return the shared generation-in-progress flag.

    NOTE(review): the function-level import copies the current value of
    ``is_currently_generating`` into a local name before the lock is
    taken; the lock only orders this read against writers, it does not
    re-read the flag. Confirm this matches the intended semantics.
    """
    from ..core.state import is_currently_generating
    with generation_state_lock:
        return is_currently_generating
28
+
29
def request_generation_stop():
    """Flag the in-flight generation to stop and grey out the Stop button."""
    set_stop_generation_requested(True)
    # Disable the button so the user cannot issue duplicate stop requests.
    return gr.update(interactive=False)
32
+
33
def perform_speech_generation(
    text_input,
    voice_mode_selection,
    voice_preset_selection,
    voice_clone_audio_file,
    model_variant,
    lsd_decode_steps,
    temperature,
    noise_clamp,
    eos_threshold,
    frames_after_eos,
    enable_custom_frames
):
    """Run one end-to-end TTS generation and return the output WAV path.

    Validates the prompt, enforces single-generation exclusivity via the
    shared state flags, loads (or reuses) the model, resolves a voice
    state (preset or cloned), generates audio, and writes it to a
    temporary file. Returns None when a stop was requested mid-way.

    Raises:
        gr.Error: on invalid input, concurrent generation, or any
            underlying generation failure (surfaced to the UI).
    """
    from ..core import state as global_state
    # Opportunistic housekeeping before the heavy work starts.
    if has_temporary_files_pending_cleanup():
        cleanup_expired_temporary_files()

    perform_memory_cleanup()

    is_valid, validation_result = validate_text_input(text_input)

    if not is_valid:
        # validation_result carries a specific message when available.
        if validation_result:
            raise gr.Error(validation_result)
        raise gr.Error("Please enter valid text to generate speech.")

    if voice_mode_selection == VOICE_MODE_CLONE:
        if not voice_clone_audio_file:
            raise gr.Error("Please upload an audio file for voice cloning.")
        if not get_huggingface_token():
            raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")

    # Claim the single generation slot atomically.
    with generation_state_lock:
        if global_state.is_currently_generating:
            raise gr.Error("A generation is already in progress. Please wait.")
        global_state.is_currently_generating = True
        global_state.stop_generation_requested = False

    generated_audio_tensor = None
    cloned_voice_state_tensor = None

    try:
        text_to_speech_manager.load_or_get_model(
            model_variant,
            temperature,
            lsd_decode_steps,
            noise_clamp,
            eos_threshold
        )

        # Stop flag is checked between each expensive stage.
        with generation_state_lock:
            if global_state.stop_generation_requested:
                return None

        if voice_mode_selection == VOICE_MODE_CLONE:
            cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(voice_clone_audio_file)
            voice_state = cloned_voice_state_tensor
        else:
            voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)

        with generation_state_lock:
            if global_state.stop_generation_requested:
                return None

        # validation_result holds the sanitized prompt text at this point.
        generated_audio_tensor = text_to_speech_manager.generate_audio(
            validation_result,
            voice_state,
            frames_after_eos,
            enable_custom_frames
        )

        with generation_state_lock:
            if global_state.stop_generation_requested:
                return None

        output_file_path = text_to_speech_manager.save_audio_to_file(generated_audio_tensor)

        return output_file_path

    except gr.Error:
        # Already user-facing; re-raise untouched.
        raise

    except RuntimeError as runtime_error:
        raise gr.Error(str(runtime_error))

    except Exception as generation_error:
        raise gr.Error(f"Speech generation failed: {str(generation_error)}")

    finally:
        # Always release the generation slot and reset the stop flag.
        with generation_state_lock:
            global_state.is_currently_generating = False
            global_state.stop_generation_requested = False

        # Drop tensor references promptly so cleanup can reclaim memory.
        if generated_audio_tensor is not None:
            del generated_audio_tensor
            generated_audio_tensor = None

        if cloned_voice_state_tensor is not None:
            del cloned_voice_state_tensor
            cloned_voice_state_tensor = None

        memory_cleanup()
        trigger_background_cleanup_check()
src/tts/manager.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ import time
7
+ import tempfile
8
+ import threading
9
+ import torch
10
+ import scipy.io.wavfile
11
+ from pocket_tts import TTSModel
12
+ from config import (
13
+ AVAILABLE_VOICES,
14
+ DEFAULT_VOICE,
15
+ DEFAULT_MODEL_VARIANT,
16
+ DEFAULT_TEMPERATURE,
17
+ DEFAULT_LSD_DECODE_STEPS,
18
+ DEFAULT_EOS_THRESHOLD,
19
+ VOICE_STATE_CACHE_MAXIMUM_SIZE,
20
+ VOICE_STATE_CACHE_CLEANUP_THRESHOLD
21
+ )
22
+ from ..core.state import (
23
+ temporary_files_registry,
24
+ temporary_files_lock,
25
+ set_text_to_speech_manager
26
+ )
27
+ from ..core.memory import (
28
+ force_garbage_collection,
29
+ memory_cleanup,
30
+ perform_memory_cleanup,
31
+ trigger_background_cleanup_check,
32
+ is_memory_usage_approaching_limit
33
+ )
34
+ from ..audio.converter import convert_audio_to_pcm_wav
35
+
36
class TextToSpeechManager:
    """Lock-guarded owner of the Pocket TTS model and a voice-state cache.

    A single ``TTSModel`` is loaded lazily and reloaded whenever the
    requested generation configuration changes. Precomputed voice-prompt
    states for preset voices are cached and evicted in least-recently-used
    order when the cache exceeds its configured bounds or memory pressure
    is detected. ``model_lock`` guards the model and its configuration;
    ``voice_state_cache_lock`` guards the two cache dictionaries.
    """

    def __init__(self):
        # TTSModel instance; None until the first load_or_get_model call.
        self.loaded_model = None
        # Configuration dict the resident model was loaded with.
        self.current_configuration = {}
        # Preset voice name -> precomputed voice-state tensor.
        self.voice_state_cache = {}
        # Preset voice name -> last access time (LRU bookkeeping).
        self.voice_state_cache_access_timestamps = {}
        self.voice_state_cache_lock = threading.Lock()
        self.model_lock = threading.Lock()

    def is_model_loaded(self):
        """Return True when a TTS model is currently resident in memory."""
        with self.model_lock:
            return self.loaded_model is not None

    def unload_model_completely(self):
        """Drop the model and every cached voice state, then reclaim memory."""
        with self.model_lock:
            # Cached states are only meaningful for the model that produced
            # them, so they must go whenever the model goes.
            self.clear_voice_state_cache_completely()

            if self.loaded_model is not None:
                del self.loaded_model
                self.loaded_model = None

            self.current_configuration = {}

            memory_cleanup()

    def load_or_get_model(
        self,
        model_variant,
        temperature,
        lsd_decode_steps,
        noise_clamp,
        eos_threshold
    ):
        """Return a TTSModel matching the requested configuration.

        Reuses the resident model when the normalised configuration is
        unchanged; otherwise discards the old model (and its voice-state
        cache) and loads a fresh one.

        Raises whatever ``TTSModel.load_model`` raises on failure.
        """
        perform_memory_cleanup()

        # Normalise raw UI values, falling back to configured defaults.
        processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip()
        processed_temperature = float(temperature) if temperature is not None else DEFAULT_TEMPERATURE
        processed_lsd_steps = int(lsd_decode_steps) if lsd_decode_steps is not None else DEFAULT_LSD_DECODE_STEPS
        # A falsy or non-positive clamp means "disabled" and is passed as None.
        processed_noise_clamp = float(noise_clamp) if noise_clamp and float(noise_clamp) > 0 else None
        processed_eos_threshold = float(eos_threshold) if eos_threshold is not None else DEFAULT_EOS_THRESHOLD

        requested_configuration = {
            "variant": processed_variant,
            "temp": processed_temperature,
            "lsd_decode_steps": processed_lsd_steps,
            "noise_clamp": processed_noise_clamp,
            "eos_threshold": processed_eos_threshold
        }

        with self.model_lock:
            if self.loaded_model is None or self.current_configuration != requested_configuration:
                if self.loaded_model is not None:
                    self.clear_voice_state_cache_completely()
                    del self.loaded_model
                    self.loaded_model = None
                    memory_cleanup()

                self.loaded_model = TTSModel.load_model(**requested_configuration)
                self.current_configuration = requested_configuration
                # Start from an empty cache for the new model. Also reset the
                # LRU timestamps so they cannot refer to discarded entries
                # (previously only the cache dict was replaced, leaving the
                # timestamp dict stale).
                self.voice_state_cache = {}
                self.voice_state_cache_access_timestamps = {}

            return self.loaded_model

    def clear_voice_state_cache_completely(self):
        """Remove every cached voice state and force a GC pass."""
        with self.voice_state_cache_lock:
            for voice_name in list(self.voice_state_cache.keys()):
                voice_state_tensor = self.voice_state_cache.pop(voice_name, None)
                if voice_state_tensor is not None:
                    del voice_state_tensor

            self.voice_state_cache.clear()
            self.voice_state_cache_access_timestamps.clear()

        force_garbage_collection()

    def _evict_oldest_locked(self, number_of_entries_to_remove):
        """Evict up to *number_of_entries_to_remove* LRU cache entries.

        Caller must already hold ``voice_state_cache_lock``. Always ends
        with a forced garbage collection, even when nothing was removed.
        """
        sorted_voice_names_by_access_time = sorted(
            self.voice_state_cache_access_timestamps.keys(),
            key=lambda voice_name: self.voice_state_cache_access_timestamps[voice_name]
        )

        for voice_name_to_remove in sorted_voice_names_by_access_time[:max(0, number_of_entries_to_remove)]:
            voice_state_tensor = self.voice_state_cache.pop(voice_name_to_remove, None)
            self.voice_state_cache_access_timestamps.pop(voice_name_to_remove, None)

            if voice_state_tensor is not None:
                del voice_state_tensor

        force_garbage_collection()

    def evict_least_recently_used_voice_states(self):
        """Shrink the voice-state cache in LRU order.

        When the cache is within the cleanup threshold (e.g. this was a
        memory-pressure call) half of the entries — at least one — are
        dropped anyway; above the threshold the cache is trimmed back down
        to the threshold.
        """
        with self.voice_state_cache_lock:
            cache_size = len(self.voice_state_cache)

            if 0 < cache_size <= VOICE_STATE_CACHE_CLEANUP_THRESHOLD:
                self._evict_oldest_locked(max(1, cache_size // 2))
                return

            # cache_size == 0 falls through harmlessly (evicts nothing,
            # still forces a GC pass, matching the original behavior).
            self._evict_oldest_locked(cache_size - VOICE_STATE_CACHE_CLEANUP_THRESHOLD)

    def get_voice_state_for_preset(self, voice_name):
        """Return (computing and caching if needed) the state for a preset voice.

        Unknown voice names silently fall back to ``DEFAULT_VOICE``.

        Raises:
            RuntimeError: if no model is loaded.
        """
        validated_voice = voice_name if voice_name in AVAILABLE_VOICES else DEFAULT_VOICE

        with self.voice_state_cache_lock:
            if validated_voice in self.voice_state_cache:
                self.voice_state_cache_access_timestamps[validated_voice] = time.time()
                return self.voice_state_cache[validated_voice]

        # Make room before computing a new (potentially large) state.
        if is_memory_usage_approaching_limit():
            self.evict_least_recently_used_voice_states()

        if len(self.voice_state_cache) >= VOICE_STATE_CACHE_MAXIMUM_SIZE:
            self.evict_least_recently_used_voice_states()

        with self.model_lock:
            if self.loaded_model is None:
                raise RuntimeError("TTS model is not loaded. Please try again.")

            computed_voice_state = self.loaded_model.get_state_for_audio_prompt(
                audio_conditioning=validated_voice,
                truncate=False
            )

        with self.voice_state_cache_lock:
            # Another thread may have cached the same voice meanwhile; keep
            # whichever entry landed first. Returning under the lock also
            # avoids the KeyError the old code risked by re-reading the
            # cache unlocked after a possible concurrent eviction.
            cached_state = self.voice_state_cache.setdefault(validated_voice, computed_voice_state)
            self.voice_state_cache_access_timestamps[validated_voice] = time.time()
            return cached_state

    def get_voice_state_for_clone(self, audio_file_path):
        """Compute a one-off (uncached) voice state from an uploaded clip.

        Raises:
            RuntimeError: if no model is loaded.
        """
        with self.model_lock:
            if self.loaded_model is None:
                raise RuntimeError("TTS model is not loaded. Please try again.")

            # Normalise arbitrary user uploads to PCM WAV before prompting.
            converted_audio_path = convert_audio_to_pcm_wav(audio_file_path)

            return self.loaded_model.get_state_for_audio_prompt(
                audio_conditioning=converted_audio_path,
                truncate=False
            )

    def generate_audio(self, text_content, voice_state, frames_after_eos, enable_custom_frames):
        """Synthesize speech for *text_content* using *voice_state*.

        When *enable_custom_frames* is falsy, ``frames_after_eos`` is passed
        as None so the model uses its default tail length.

        Raises:
            RuntimeError: if no model is loaded.
        """
        with self.model_lock:
            if self.loaded_model is None:
                raise RuntimeError("TTS model is not loaded. Please try again.")

            processed_frames = int(frames_after_eos) if enable_custom_frames else None

            generated_audio = self.loaded_model.generate_audio(
                model_state=voice_state,
                text_to_generate=text_content,
                frames_after_eos=processed_frames,
                copy_state=True  # leave the cached voice state untouched
            )

        force_garbage_collection()

        return generated_audio

    def save_audio_to_file(self, audio_tensor):
        """Write *audio_tensor* to a temporary WAV file and return its path.

        The path is registered in the temporary-files registry so the
        background sweeper can delete it later.

        Raises:
            RuntimeError: if no model is loaded (sample rate unavailable).
        """
        with self.model_lock:
            if self.loaded_model is None:
                raise RuntimeError("TTS model is not loaded. Cannot save audio.")

            audio_sample_rate = self.loaded_model.sample_rate

        audio_numpy_data = audio_tensor.numpy()

        output_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        # Close the handle before writing: scipy reopens by path, and the
        # old code leaked one file descriptor per generation (and an open
        # handle blocks the write on Windows).
        output_file.close()
        scipy.io.wavfile.write(output_file.name, audio_sample_rate, audio_numpy_data)

        with temporary_files_lock:
            temporary_files_registry[output_file.name] = time.time()

        trigger_background_cleanup_check()

        return output_file.name
229
+
230
# Module-level singleton used by the UI layer; registered with the shared
# core state module so other components can retrieve the manager through
# that module rather than importing this one directly.
text_to_speech_manager = TextToSpeechManager()
set_text_to_speech_manager(text_to_speech_manager)
src/ui/handlers.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ import gradio as gr
7
+ from config import VOICE_MODE_PRESET, DEFAULT_VOICE
8
+ from ..validation.text import validate_text_input
9
+
10
def switch_to_generating_state(ui_state):
    """Flip the UI into its "generation in progress" layout.

    Hides the generate button, shows the stop button, hides the clear
    button, and returns a fresh session-state dict with the generating
    flag set. The incoming *ui_state* is accepted for handler-signature
    compatibility but not consulted.
    """
    return (
        gr.update(visible=False),
        gr.update(visible=True, interactive=True),
        gr.update(visible=False),
        {"generating": True},
    )
19
+
20
def switch_to_idle_state(text_content, ui_state):
    """Restore the idle UI after generation finishes or is stopped.

    Re-enables the generate button only when the current text validates,
    shows the clear button only when the text box is non-blank, and
    returns a session-state dict with the generating flag cleared.
    """
    text_is_valid, _ = validate_text_input(text_content)
    show_clear_button = bool(text_content and text_content.strip())

    return (
        gr.update(visible=True, interactive=text_is_valid),
        gr.update(visible=False),
        gr.update(visible=show_clear_button),
        {"generating": False},
    )
34
+
35
def perform_clear_action():
    """Reset every input control to its default value.

    Clears the text box, audio output, and clone-audio upload; hides the
    clear button; and returns the voice controls to preset mode with the
    default voice selected.
    """
    cleared_text = ""
    cleared_audio_output = None
    cleared_clone_audio = None
    hidden_clear_button = gr.update(visible=False)

    return (
        cleared_text,
        cleared_audio_output,
        hidden_clear_button,
        VOICE_MODE_PRESET,
        DEFAULT_VOICE,
        cleared_clone_audio,
    )
44
+
45
def create_example_handler(example_text, example_voice):
    """Build a click callback that fills the form with an example.

    The returned zero-argument closure yields the example text, the
    preset voice mode, and the example's voice name.
    """
    def _apply_example_values():
        return example_text, VOICE_MODE_PRESET, example_voice

    return _apply_example_values
50
+
51
def format_example_button_label(example_text, example_voice, max_text_length=40):
    """Return the label shown on an example button.

    Texts longer than *max_text_length* characters are cut and suffixed
    with an ellipsis; the voice name is prefixed in square brackets.
    """
    if len(example_text) > max_text_length:
        display_text = example_text[:max_text_length] + "..."
    else:
        display_text = example_text

    return f"[{example_voice}] {display_text}"
src/ui/state.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ import gradio as gr
7
+ from config import MAXIMUM_INPUT_LENGTH, VOICE_MODE_CLONE
8
+ from ..validation.text import validate_text_input
9
+
10
def check_generate_button_state(text_content, ui_state):
    """Enable the generate button only when idle with valid text.

    Always disabled while a generation is in progress; otherwise follows
    the text validator's verdict.
    """
    currently_generating = ui_state.get("generating", False)
    if currently_generating:
        return gr.update(interactive=False)

    text_is_valid, _ = validate_text_input(text_content)
    return gr.update(interactive=text_is_valid)
17
+
18
def calculate_character_count_display(text_content):
    """Render the character-counter HTML, shown in red when over the limit."""
    character_count = len(text_content) if text_content else 0

    if character_count > MAXIMUM_INPUT_LENGTH:
        display_color = "var(--error-text-color)"
    else:
        display_color = "var(--body-text-color-subdued)"

    return f"<div style='text-align: right; padding: 4px 0;'><span style='color: {display_color}; font-size: 0.85em;'>{character_count} / {MAXIMUM_INPUT_LENGTH}</span></div>"
28
+
29
def determine_clear_button_visibility(text_content, ui_state):
    """Show the clear button only when idle and the text box is non-blank."""
    if ui_state.get("generating", False):
        return gr.update(visible=False)

    return gr.update(visible=bool(text_content and text_content.strip()))
37
+
38
def update_voice_mode_visibility(voice_mode_value):
    """Toggle between the preset-voice picker and the clone-audio uploader.

    Returns visibility updates as (preset_controls, clone_controls).
    """
    clone_mode_active = voice_mode_value == VOICE_MODE_CLONE
    return (
        gr.update(visible=not clone_mode_active),
        gr.update(visible=clone_mode_active),
    )
src/validation/text.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ from config import MAXIMUM_INPUT_LENGTH
7
+
8
def validate_text_input(text_content):
    """Validate user text for generation.

    Returns a ``(is_valid, payload)`` pair: on success the payload is the
    stripped text; on failure it is an empty string, or an error message
    for the over-length case.
    """
    if not isinstance(text_content, str) or not text_content:
        return False, ""

    cleaned_text = text_content.strip()
    if not cleaned_text:
        return False, ""

    if len(cleaned_text) > MAXIMUM_INPUT_LENGTH:
        return False, f"Input exceeds maximum length of {MAXIMUM_INPUT_LENGTH} characters."

    return True, cleaned_text