Update app.py
app.py (CHANGED)
--- a/app.py
+++ b/app.py
@@ -12,24 +12,21 @@ import nltk
 from pydub import AudioSegment

 # --- App Setup ---
-app = FastAPI(title="Kitten TTS API", version="0.2.
+app = FastAPI(title="Kitten TTS API", version="0.2.1") # Version bump

 # --- Model & Tokenizer Loading ---
-# Download NLTK data (one-time)
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')

-# Load the TTS model once at startup
 print("Loading KittenTTS model...")
 model = KittenTTS("KittenML/kitten-tts-nano-0.2")
 print("Model loaded.")

-# List of available voices
 voices = [
-    "expr-voice-
-    "expr-voice-
+    "expr-voice-1-f", "expr-voice-2-m", "expr-voice-3-f", "expr-voice-4-m",
+    "expr-voice-5-f", "expr-voice-6-m", "expr-voice-7-f", "expr-voice-8-m"
 ]

 # --- Request Models ---
@@ -37,14 +34,42 @@ class SpeechRequest(BaseModel):
     input: str
     model: str = "kitten-nano-0.2"
     voice: str = "expr-voice-1-f"
-    speed: Optional[float] = 1.0
-    response_format: str = "mp3"
+    speed: Optional[float] = 1.0
+    response_format: str = "mp3"

-# ---
-
-
+# --- --- --- --- --- --- --- --- --- ---
+# --- THIS IS THE FIX ---
+# --- --- --- --- --- --- --- --- --- ---
+def get_text_batches(text: str, min_batch_chars: int = 150):
+    """
+    Joins small NLTK sentences into larger, 'audio-safe' batches.
+    This prevents sending tiny chunks to ffmpeg, which fails to encode.
+    """
     sentences = nltk.sent_tokenize(text)
-
+    current_batch = ""
+
+    for sentence in sentences:
+        if not sentence.strip():
+            continue
+
+        # Add the sentence to the current batch
+        if current_batch:
+            current_batch += " " + sentence
+        else:
+            current_batch = sentence
+
+        # If the batch is large enough, yield it
+        if len(current_batch) >= min_batch_chars:
+            yield current_batch
+            current_batch = ""
+
+    # Yield any remaining text in the last batch
+    if current_batch.strip():
+        yield current_batch
+# --- --- --- --- --- --- --- --- --- ---
+# --- END OF FIX ---
+# --- --- --- --- --- --- --- --- --- ---
+

 # --- Streaming Generator Logic ---
 async def audio_stream_generator(text: str, voice: str) -> AsyncGenerator[bytes, None]:
@@ -56,15 +81,6 @@ async def audio_stream_generator(text: str, voice: str) -> AsyncGenerator[bytes, None]:
         yield b""
         return

-    # 1. Define FFmpeg command for on-the-fly conversion
-    # -f s16le: Input is signed 16-bit little-endian (raw PCM from WAV)
-    # -ar 24000: KittenTTS sample rate is 24kHz
-    # -ac 1: Mono audio
-    # -i pipe:0: Read from stdin
-    # -f mp3: Output format is MP3
-    # -b:a 96k: 96 kbps bitrate (good for speech)
-    # -ar 44100: Resample to 44.1kHz for better compatibility (optional)
-    # pipe:1: Write to stdout
     ffmpeg_command = [
         "ffmpeg",
         "-f", "s16le",
@@ -77,17 +93,14 @@ async def audio_stream_generator(text: str, voice: str) -> AsyncGenerator[bytes, None]:
         "pipe:1"
     ]

-    # 2. Start the FFmpeg subprocess
     process = await asyncio.create_subprocess_exec(
         *ffmpeg_command,
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE
+        stderr=subprocess.PIPE
     )

     try:
-        # 3. Create an asyncio task to read from stdout (non-blocking)
-        # This task will yield MP3 chunks as FFmpeg produces them.
         async def read_stdout():
             while True:
                 chunk = await process.stdout.read(4096)
@@ -97,53 +110,50 @@ async def audio_stream_generator(text: str, voice: str) -> AsyncGenerator[bytes, None]:

         stdout_reader = read_stdout()

-        # 4. Generate and feed WAV data into FFmpeg's stdin
         async def feed_stdin():
             try:
-
-
-
-
+                # --- USE THE BATCHER ---
+                batches = get_text_batches(text)
+                for batch in batches:
+                    if not batch.strip():
+                        continue
+
+                    wav_bytes = model.generate(text=batch, voice=voice)

-                # Use pydub to strip the WAV header and get raw PCM data
                     wav_audio = AudioSegment.from_wav(io.BytesIO(wav_bytes))
                     raw_pcm_data = wav_audio.raw_data

-
+                    if not raw_pcm_data:
+                        print(f"Warning: No audio data for batch: {batch}")
+                        continue
+
                     process.stdin.write(raw_pcm_data)
                     await process.stdin.drain()

-                # Small sleep to allow generation to be non-blocking
                     await asyncio.sleep(0.01)

             except Exception as e:
                 print(f"Error in feed_stdin: {e}")
             finally:
-
-                if not process.stdin.is_closing():
+                if process.stdin and not process.stdin.is_closing():
                     process.stdin.close()
                     await process.stdin.wait_closed()

-        # 5. Run the feeder and reader tasks concurrently
         feeder_task = asyncio.create_task(feed_stdin())

         async for mp3_chunk in stdout_reader:
             yield mp3_chunk

-        # Wait for feeder to finish
         await feeder_task

-        # Wait for FFmpeg to finish
         await process.wait()

-        # Check for FFmpeg errors
         stderr_data = await process.stderr.read()
         if stderr_data:
             print(f"FFmpeg stderr: {stderr_data.decode()}")

     except Exception as e:
         print(f"Streaming generator error: {e}")
-        # Clean up process if it's still running
         if process.returncode is None:
             process.terminate()
             await process.wait()
@@ -153,19 +163,13 @@ async def audio_stream_generator(text: str, voice: str) -> AsyncGenerator[bytes, None]:
 # --- API Endpoints ---
 @app.post("/v1/audio/speech")
 async def generate_speech(request: SpeechRequest):
-    """
-    Generates speech audio, streaming the response.
-    Supports 'mp3' (streaming) and 'wav' (blocking, file-stream).
-    """
-    # Validation
     if request.voice not in voices:
         raise HTTPException(status_code=400, detail=f"Voice must be one of {voices}")
-    if len(request.input) > 2000:
+    if len(request.input) > 2000:
         raise HTTPException(status_code=413, detail="Input text too long; max 2000 chars")

     try:
         if request.response_format == "mp3":
-            # The new TRUE-STREAMING MP3 path
             return StreamingResponse(
                 audio_stream_generator(text=request.input, voice=request.voice),
                 media_type="audio/mpeg",
@@ -184,7 +188,7 @@ async def generate_speech(request: SpeechRequest):
             return StreamingResponse(
                 iter_bytes(),
                 media_type="audio/wav",
-                headers={"Content-
+                headers={"Content-Disposition": "attachment; filename=speech.wav"}
             )

         else:
@@ -200,7 +204,7 @@ async def list_voices():

 @app.get("/")
 async def root():
-    return {"message": "Kitten TTS API v0.2.
+    return {"message": "Kitten TTS API v0.2.1 (Batching Fix)"}

 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
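For reference, a minimal client sketch that exercises the streaming endpoint touched by this change. It is not part of the commit: it assumes the server from this diff is running locally (uvicorn on port 7860, as in the `__main__` block) and that the third-party `requests` package is installed; the endpoint path, JSON fields, and voice name are taken from the diff above.

# Minimal client sketch -- assumptions: `python app.py` is running locally on
# port 7860 and `requests` is installed; neither is part of app.py itself.
import requests

payload = {
    "input": (
        "Hello from Kitten TTS. These sentences are short, so the new "
        "get_text_batches() helper should merge them into one larger batch "
        "before the audio is piped through FFmpeg."
    ),
    "voice": "expr-voice-1-f",
    "response_format": "mp3",
}

# stream=True consumes the MP3 chunks as the server's StreamingResponse emits them.
with requests.post(
    "http://localhost:7860/v1/audio/speech", json=payload, stream=True
) as resp:
    resp.raise_for_status()
    with open("speech.mp3", "wb") as f:
        for chunk in resp.iter_content(chunk_size=4096):
            if chunk:
                f.write(chunk)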