import os
import re
import torch
from langdetect import detect, LangDetectException
# Injected prosody speed predictor, trained mainly on Japanese IPA
class ProsodyLSTM(torch.nn.Module):
def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.2):
super().__init__()
self.embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
self.lstm = torch.nn.LSTM(
embedding_dim,
hidden_dim,
num_layers=num_layers,
batch_first=True,
bidirectional=True,
dropout=dropout
)
self.fc = torch.nn.Sequential(
torch.nn.Linear(hidden_dim * 2, hidden_dim),
torch.nn.Tanh(),
torch.nn.Linear(hidden_dim, 1)
)
def forward(self, phoneme_ids):
# phoneme_ids shape: (B, T_sequence)
embedded = self.embedding(phoneme_ids)
lstm_out, _ = self.lstm(embedded)
# We take the mean of the LSTM outputs over the time dimension
# This gives a single vector representing the whole sentence's prosody
pooled_out = torch.mean(lstm_out, dim=1)
# Predict a single speed value for the sentence
speed = self.fc(pooled_out)
return speed
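# Shape walk-through for ProsodyLSTM: phoneme_ids (B, T) -> embedding (B, T, 128)
# -> bidirectional LSTM (B, T, 512) -> mean over T (B, 512) -> fc (B, 1),
# i.e. one scalar speed prediction per input sentence.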
phoneme_vocab = torch.load(os.path.join("./", "phoneme_vocab.pt"))
use_prosody_pred = True
try:
vocab_size = len(phoneme_vocab)
device = "cpu" #for default and it's not too heavy the model
model_Prosody = ProsodyLSTM(vocab_size=vocab_size).to(device)
model_Prosody.load_state_dict(torch.load("./prosody_speed_lstm.pth", map_location=device))
model_Prosody.eval()
print("Loaded IPA prosody speed predictor")
except Exception as e:
    print(f"Could not load the prosody speed predictor ({e}); falling back to fixed speed.")
    use_prosody_pred = False
# Configure eSpeak (ensure paths are correctly set for your environment)
def configure_espeak():
"""Configure espeak-ng paths for Windows"""
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
os.environ["PHONEMIZER_ESPEAK_PATH"] = r"C:\Program Files\eSpeak NG\espeak-ng.exe"
print("PHONEMIZER_ESPEAK_LIBRARY:", os.environ.get("PHONEMIZER_ESPEAK_LIBRARY"))
print("PHONEMIZER_ESPEAK_PATH:", os.environ.get("PHONEMIZER_ESPEAK_PATH"))
if not os.path.exists(os.environ["PHONEMIZER_ESPEAK_LIBRARY"]):
raise FileNotFoundError(f"Could not find espeak library at {os.environ['PHONEMIZER_ESPEAK_LIBRARY']}")
if not os.path.exists(os.environ["PHONEMIZER_ESPEAK_PATH"]):
raise FileNotFoundError(f"Could not find espeak executable at {os.environ['PHONEMIZER_ESPEAK_PATH']}")
# Call the configuration function for eSpeak
if os.name == 'nt':
configure_espeak()
def split_num(num):
num = num.group()
if '.' in num:
return num
elif ':' in num:
h, m = [int(n) for n in num.split(':')]
if m == 0:
return f"{h} o'clock"
elif m < 10:
return f'{h} oh {m}'
return f'{h} {m}'
year = int(num[:4])
if year < 1100 or year % 1000 < 10:
return num
left, right = num[:2], int(num[2:4])
s = 's' if num.endswith('s') else ''
if 100 <= year % 1000 <= 999:
if right == 0:
return f'{left} hundred{s}'
elif right < 10:
return f'{left} oh {right}{s}'
return f'{left} {right}{s}'
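# split_num examples (reached through normalize_text): "3:00" -> "3 o'clock",
# "3:05" -> "3 oh 5", "1805" -> "18 oh 5", "1900s" -> "19 hundreds", "2019" -> "20 19".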
def flip_money(m):
m = m.group()
bill = 'dollar' if m[0] == '$' else 'pound'
if m[-1].isalpha():
return f'{m[1:]} {bill}s'
elif '.' not in m:
s = '' if m[1:] == '1' else 's'
return f'{m[1:]} {bill}{s}'
b, c = m[1:].split('.')
s = '' if b == '1' else 's'
c = int(c.ljust(2, '0'))
coins = f"cent{'' if c == 1 else 's'}" if m[0] == '$' else ('penny' if c == 1 else 'pence')
return f'{b} {bill}{s} and {c} {coins}'
def point_num(num):
a, b = num.group().split('.')
return ' point '.join([a, ' '.join(b)])
def normalize_text(text):
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace('«', chr(8220)).replace('»', chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace('(', '«').replace(')', '»')
for a, b in zip('、。!,:;?', ',.!,:;?'):
text = text.replace(a, b+' ')
text = re.sub(r'[^\S \n]', ' ', text)
text = re.sub(r' +', ' ', text)
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
text = re.sub(r'(?<=\d),(?=\d)', '', text)
text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
text = re.sub(r'\d*\.\d+', point_num, text)
text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text)
text = re.sub(r'(?<=\d)S', ' S', text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", 's', text)
text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
return text.strip()
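# Rough normalize_text example: "Dr. Smith paid $4.50 at 3:05." should come out as
# roughly "Doctor Smith paid 4 dollars and 50 cents at 3 oh 5."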
def get_vocab():
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
dicts = {}
    for i, symbol in enumerate(symbols):
        dicts[symbol] = i
return dicts
VOCAB = get_vocab()
def tokenize(ps):
return [i for i in map(VOCAB.get, ps) if i is not None]
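# Tokenization sketch: tokenize() maps each character of an IPA string to its VOCAB id
# and silently drops anything not in VOCAB, e.g. tokenize("həlˈoʊ") yields one id per
# symbol that exists in VOCAB.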
# The dictionary of supported language codes for our phonemizer
def get_phonemizer_lang_code(text):
"""
Detects the language of the input text and maps it to a language code
compatible with the espeak-ng backend.
Args:
text: The text for language detection.
Returns:
A string with the espeak-ng language code, or None if detection fails.
"""
# A mapping from ISO 639-1 language codes (from langdetect) to
# the more specific codes that espeak-ng uses.
# You can find more espeak codes by running `espeak-ng --voices`.
LANGDETECT_TO_ESPEAK_MAP = {
'en': 'en-us', # English (US) OK
'fr': 'fr-fr', # French OK
'es': 'es', # Spanish (Latin America) OK
'de': 'de', # German Not tested
'it': 'it', # Italian OK
        'pt': 'pt-pt',  # Portuguese (Portugal); sometimes fails (espeak-ng issues)
'ru': 'ru', # Russian Not tested
'zh-cn': 'cmn', # Mandarin (Chinese) OK
        'ja': 'ja',     # Japanese OK, custom backend
'ko': 'ko', # Korean not tested
        'ar': 'ar',     # Arabic; doesn't work
        'hi': 'hi',     # Hindi; doesn't work
}
try:
# 1. Detect the 2-letter language code
lang_code_2_letter = detect(text)
# 2. Use the map to get the espeak-ng code.
# If not in the map, fallback to the detected code itself,
# as it might work for some languages.
return LANGDETECT_TO_ESPEAK_MAP.get(lang_code_2_letter, lang_code_2_letter)
except LangDetectException:
# This error occurs for very short, ambiguous, or non-alphabetic strings.
print(f"Warning: Could not detect language for text: '{text}'")
return None
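# Usage sketch (langdetect is probabilistic, so short strings may be misclassified):
#   get_phonemizer_lang_code("Bonjour tout le monde")  # expected 'fr-fr'
#   get_phonemizer_lang_code("12345")                  # typically None (detection fails)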
# --- Main Phonemizer Function ---
from phonemizer.phonemize import phonemize
def speak_phonemizer(text: str) -> str:
"""
Takes a text, automatically detects its language, and converts it to
phonemes using espeak-ng. Relies on OS environment variables to find
the espeak-ng installation.
Args:
text: The text to be converted to phonemes.
Returns:
The phonemic representation of the text (IPA), or an error message if
phonemization fails.
"""
# 1. Get the language code for the phonemizer
lang_code = get_phonemizer_lang_code(text)
if not lang_code:
return f"Error: Could not determine a supported language for the text."
print(f"-> Detected language: '{lang_code}' for text: \"{text[:50]}...\"")
try:
# 2. Use the detected language code to phonemize the text
# The phonemize function will automatically use the environment variables:
# PHONEMIZER_ESPEAK_LIBRARY and PHONEMIZER_ESPEAK_PATH
phonemes = phonemize(
text,
language=lang_code,
backend='espeak',
strip=False, # Keep for the TTS
preserve_punctuation=True,
njobs=1 # Use a single job for better error reporting
)
return phonemes
except RuntimeError as e:
# This is often the error if the espeak library/executable isn't found
print (f"Error during phonemization. This might mean espeak-ng is not found. "
f"Please ensure 'espeak-ng' is installed and the environment variables "
f"'PHONEMIZER_ESPEAK_PATH' and 'PHONEMIZER_ESPEAK_LIBRARY' are set correctly.\n"
f"Original error: {e}")
return text
except Exception as e:
print(f"Error during phonemization: {e}")
        return text  # fall back to raw text; it may still be usable if it's plain English letters
def phonemize_TEXT_buggy(text, lang, norm=True):
if norm:
text = normalize_text(text)
    ps = speak_phonemizer(text)  # previously: phonemizers[lang].phonemize([text])
    # print(ps)
    ps = ps or ''  # speak_phonemizer returns a string (not a list), so don't take ps[0]
# https://en.wiktionary.org/wiki/kokoro#English
ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
ps = ps.replace('ʲ', 'j').replace('r', 'ɹ').replace('x', 'k').replace('ɬ', 'l')
ps = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', ps)
ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', ps)
if lang == 'a':
ps = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', ps)
ps = ''.join(filter(lambda p: p in VOCAB, ps))
return ps.strip()
# --- Example Usage ---
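# A minimal sketch of the intended flow (assumes espeak-ng is installed and reachable
# through the environment variables configured above; the exact IPA output depends on
# the installed espeak-ng version):
#   ipa = speak_phonemizer("Hello world, this is a test.")
#   token_ids = tokenize(ipa)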
def length_to_mask(lengths):
mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
mask = torch.gt(mask+1, lengths.unsqueeze(1))
return mask
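# e.g. length_to_mask(torch.LongTensor([3, 5])) ->
#   tensor([[False, False, False,  True,  True],
#           [False, False, False, False, False]])
# True marks positions past each sequence's length (i.e. padding).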
@torch.no_grad()
def forward_2(model, tokens, ref_s, speed):
device = ref_s.device
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
c_frame += pred_dur[0,i].item()
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
ALIASES = {
'en-us': 'a',
'en-gb': 'b',
'es': 'e',
'fr-fr': 'f',
'hi': 'h',
'it': 'i',
'pt-br': 'p',
'ja': 'j',
'zh': 'cmn',
}
def generate_safe_old(model, text, voicepack, lang='a', speed=1, ps=None): #lang is unused
lc = get_phonemizer_lang_code(text)
# print(lc) #debug
if lc == "ja":
ps = ps or g2pja(text)
else:
        ps = ps or phonemize(text, lc)  # old espeak-ng backend; sounds less natural
tokens = tokenize(ps)
if not tokens:
return None
elif len(tokens) > 510:
tokens = tokens[:509]
        print('Truncated to 509 tokens')
ref_s = voicepack[len(tokens)]
out = forward(model, tokens, ref_s, speed)
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
return out, ps
import numpy as np
import ja
# --- Assume these functions/variables exist from your project ---
# from some_tts_library import forward, tokenize, VOCAB
# from your_language_detection import get_phonemizer_lang_code
# from old_phonemizer import phonemize
# Initialize our advanced Japanese G2P engine
g2pja = ja.JapaneseToIPA()
def split_sentences(text):
"""
V2: A more aggressive and reliable sentence splitter.
It replaces all terminators with a unique delimiter and then splits.
"""
# Replace newlines with the delimiter first, as they are the strongest boundary.
text = text.replace('\n', '|||')
# Replace all common sentence terminators with themselves plus the delimiter.
# This keeps the original punctuation in the sentence.
text = re.sub(r'([.!?。!?])', r'\g<0>|||', text)
sentences = text.split('|||')
return [s.strip() for s in sentences if s.strip()]
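# e.g. split_sentences("こんにちは。元気ですか?\nYes! I am fine.")
#   -> ['こんにちは。', '元気ですか?', 'Yes!', 'I am fine.']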
# --- THE NEW AND IMPROVED INFERENCE FUNCTION ---
@torch.no_grad()
def generate_safe(model, text, voicepack, lang=None, speed=1.0, ps=None, sample_rate=22050, norm=True):
"""
Synthesizes audio from text, with advanced handling for Japanese prosody,
automatic speed inference, and smart chunking for very long inputs.
Args:
model: The TTS model.
text (str): The input text.
voicepack: The voice pack containing reference audio samples.
lang (str): Language code (now auto-detected, largely unused).
        speed (float): Manual speed hint. If not 1.0, it is blended with the
            per-chunk mean speed and the prosody model's prediction.
        ps (str): Optional pre-computed phonemes. If provided, text processing is skipped.
        sample_rate (int): The sample rate of the output audio.
        norm (bool): Whether to run normalize_text() on non-Japanese sentences.
Returns:
A tuple of (numpy.ndarray, str) containing the full audio waveform and
the complete phoneme string, or (None, None) on failure.
"""
if ps:
# If phonemes are provided directly, use the old simple synthesis path
# This is a single chunk, so truncation is still a risk here.
print("Using pre-computed phonemes.")
tokens = tokenize(ps)
if not tokens: return None, None
if len(tokens) > 510:
print('Warning: Pre-computed phonemes truncated.')
tokens = tokens[:509]
ref_s = voicepack[len(tokens)]
out = forward(model, tokens, ref_s, speed)
return out, ps
# --- Language Detection and Phoneme Generation ---
# print(text)
sentences = split_sentences(text)
phoneme_data = []
for sentence in sentences:
lc = lang or get_phonemizer_lang_code(sentence)
print(f"Detected language: {lc}, {sentence[:16]}")
        if lc == "ja":
            # g2pja returns a list of (ipa, speed) tuples, so extend with all of them
            phoneme_data.extend(g2pja(sentence))
        else:
            # For non-Japanese text, create a single compatible (ipa, speed) entry
if norm:
sentence = normalize_text(sentence)
ipa_line = speak_phonemizer(sentence)
# phonemize_TEXT(text, lc)
# print(ipa_line,sentence)
phoneme_data.append((ipa_line, 1.0))
if not phoneme_data:
print("Error: Phonemizer returned no data.")
return None, None
# --- Smart Chunking and Sequential Synthesis ---
all_audio_chunks = []
full_phoneme_string = ""
current_chunk_tokens = []
chunk_lines_data = [] # To store (ipa, speed) for the current chunk
for ipa_line, inferred_speed in phoneme_data:
line_tokens = tokenize(ipa_line)
# print(line_tokens)
# Check if adding the new line would exceed the token limit
if current_chunk_tokens and (len(current_chunk_tokens) + len(line_tokens)) > 200:
# 1. Synthesize the current chunk before it gets too big
print(f"Synthesizing a chunk of {len(current_chunk_tokens)} tokens...")
ref_s = voicepack[min(len(current_chunk_tokens),509)]
            # Blend the manual speed (if given) with the chunk's mean inferred speed
            # and, when available, the prosody model's prediction.
            if speed != 1:
speeds_in_chunk = [s for _, s in chunk_lines_data]
chunk_speed = sum(speeds_in_chunk) / len(speeds_in_chunk) if speeds_in_chunk else speed
phonemes = re.findall(r'ˌ|ˈ| |[a-zɕʑçɸɲdʒɾɡɯː↗]+', " ".join(line_data[0] for line_data in chunk_lines_data))
phoneme_ids = [phoneme_vocab.get(p.strip(), phoneme_vocab['<unk>']) for p in phonemes if p.strip()]
predicted_speed = 1
if phoneme_ids and use_prosody_pred:
phoneme_tensor = torch.LongTensor(phoneme_ids).unsqueeze(0).to(device)
predicted_speed_tensor = model_Prosody(phoneme_tensor)
predicted_speed = predicted_speed_tensor.item()
print(predicted_speed)
chunk_speed = (speed + chunk_speed + predicted_speed )/3
else:
speeds_in_chunk = [s for _, s in chunk_lines_data]
chunk_speed = sum(speeds_in_chunk) / len(speeds_in_chunk) if speeds_in_chunk else speed
                # Convert the chunk's IPA to prosody-model ids and predict a speed
phonemes = re.findall(r'ˌ|ˈ| |[a-zɕʑçɸɲdʒɾɡɯː↗]+', " ".join(line_data[0] for line_data in chunk_lines_data))
phoneme_ids = [phoneme_vocab.get(p.strip(), phoneme_vocab['<unk>']) for p in phonemes if p.strip()]
predicted_speed = 1
if phoneme_ids and use_prosody_pred:
phoneme_tensor = torch.LongTensor(phoneme_ids).unsqueeze(0).to(device)
predicted_speed_tensor = model_Prosody(phoneme_tensor)
predicted_speed = predicted_speed_tensor.item()
chunk_speed = predicted_speed
print(predicted_speed)
else:
chunk_speed = (chunk_speed + predicted_speed) / 2
print(f" -> Inferred mean chunk speed: {chunk_speed:.2f}")
try:
audio_chunk = forward(model, current_chunk_tokens, ref_s, chunk_speed)
all_audio_chunks.append(audio_chunk)
            except Exception as e:
                print(f"forward() failed for this chunk (phonemes too long for the model?): {e}")
# Also append the IPA strings for the full phoneme output
full_phoneme_string += " ".join(line_data[0] for line_data in chunk_lines_data) + " "
# 2. Start a new chunk with the current line
current_chunk_tokens = line_tokens
chunk_lines_data = [(ipa_line, inferred_speed)]
else:
# 3. Add the current line to the existing chunk
current_chunk_tokens.extend(line_tokens)
chunk_lines_data.append((ipa_line, inferred_speed))
# After the loop, synthesize any remaining lines in the last chunk
if current_chunk_tokens:
print(f"Synthesizing the final chunk of {len(current_chunk_tokens)} tokens...")
        ref_s = voicepack[min(len(current_chunk_tokens), 509)]  # clamp like the in-loop path
speeds_in_chunk = [s for _, s in chunk_lines_data]
chunk_speed = sum(speeds_in_chunk) / len(speeds_in_chunk) if speeds_in_chunk else 1.0
print(f" -> Inferred mean chunk speed: {chunk_speed:.2f}")
audio_chunk = forward(model, current_chunk_tokens, ref_s, chunk_speed)
all_audio_chunks.append(audio_chunk)
full_phoneme_string += " ".join(line_data[0] for line_data in chunk_lines_data)
if not all_audio_chunks:
return None, None
# --- Audio Concatenation ---
# Create a short silence array (e.g., 0.25 seconds) to place between chunks
silence_duration = 0.25
silence = np.zeros(int(silence_duration * sample_rate))
# Join the audio chunks with silence
final_audio = np.concatenate([part for chunk in all_audio_chunks for part in (chunk, silence)][:-1])
return final_audio, full_phoneme_string.strip()
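# Example call sketch (MODEL and VOICEPACK are placeholders for objects loaded elsewhere,
# e.g. a Kokoro-style model and its voicepack tensor):
#   audio, phonemes = generate_safe(MODEL, "こんにちは。今日はいい天気ですね。", VOICEPACK)
#   # `audio` is a 1-D numpy array at `sample_rate`; write it out with any audio library.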
from collections import defaultdict
# Assume the original helper functions exist:
# phonemize, tokenize, forward, VOCAB
def generate_batched(model, texts, voicepacks, lang='a', speed=1.0, batch_size=16, device="cuda"):
"""
Generates audio in batches by grouping texts of the same length.
Args:
model: The TTS model.
texts (list[str]): A list of texts to synthesize.
voicepacks (list[torch.Tensor]): A list of voicepacks, one for each text.
lang (str): The language for phonemization.
speed (float): The generation speed.
        batch_size (int): The maximum number of items to process in one GPU forward pass.
        device (str): Device to run inference on (e.g. "cuda" or "cpu").
Returns:
list[tuple]: A list of (audio_tensor, phoneme_string) tuples in the original order.
"""
if not isinstance(texts, list):
texts = [texts]
voicepacks = [voicepacks]
# --- 1. Pre-processing and Grouping ---
# This part is still serial, but it's fast CPU work.
grouped_inputs = defaultdict(list)
for i, (text, voicepack) in enumerate(zip(texts, voicepacks)):
# Original pre-processing
ps = phonemize(text, lang)
tokens = tokenize(ps)
if not tokens:
# Handle empty inputs if necessary
continue
if len(tokens) > 510:
tokens = tokens[:510]
# ps would also need to be truncated if you need it accurate
print(f'Warning: Truncated input {i} to 510 tokens')
token_len = len(tokens)
# Group by length. Store everything needed for the batch.
grouped_inputs[token_len].append({
"original_index": i,
"tokens": tokens,
"voicepack": voicepack,
"ps": ps
})
# --- 2. Batched Inference ---
# We will process group by group.
outputs = [None] * len(texts)
print(f"Processing {len(texts)} items in {len(grouped_inputs)} length groups.")
for token_len, items in grouped_inputs.items():
# Get the single ref_s for this entire group
# (We assume all items in a group can use the ref_s from the first item's voicepack)
ref_s = items[0]["voicepack"][token_len].to(device)
print(ref_s.shape)
# Process the group in mini-batches to respect the user-defined batch_size
for i in range(0, len(items), batch_size):
batch_items = items[i:i + batch_size]
current_batch_size = len(batch_items)
# Prepare the batch tensors
batch_tokens = [item["tokens"] for item in batch_items]
            batch_tokens_tensor = torch.LongTensor(batch_tokens).to(device)  # every item in this group has the same length
            # The same ref_s is shared by every item in the batch, so broadcast it
            # across a new leading batch dimension
            batch_ref_s = ref_s.unsqueeze(0).expand(current_batch_size, -1, -1, -1)
# --- Call the batched forward pass ---
# This is where the speedup happens
            with torch.no_grad():
                # forward_b below is the batch-compatible forward pass
                out_batch = forward_b(model, batch_tokens_tensor, batch_ref_s, speed)
# --- 3. De-batching and Storing ---
# Place results back in their original positions
for j, item in enumerate(batch_items):
original_index = item["original_index"]
audio_out = out_batch[j] # The forward pass should return a batch of audio
# The original `ps` was already calculated and stored
ps_out = ''.join(next(k for k, v in VOCAB.items() if t == v) for t in item["tokens"])
outputs[original_index] = (audio_out, ps_out)
return outputs
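# Batched usage sketch (placeholder names; relies on the batch-compatible forward_b below
# and on every text in a length group sharing the same ref_s):
#   results = generate_batched(MODEL, ["First line.", "Second line."],
#                              [VOICEPACK, VOICEPACK], lang='a', batch_size=8)
#   # results[i] is an (audio, phoneme_string) tuple in the original input order.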
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
# Device management
device = ref_s.device
# Tokenization
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]])
# Text Mask
text_mask = length_to_mask(input_lengths).to(device)
# Predictor
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# Fusion layers
x, _ = model.predictor.lstm(d)
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
# Prediction
pred_dur = torch.round(duration).clamp(min=1).long()
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
c_frame += pred_dur[0, i].item()
# Decoder
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
# Output
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu()#.numpy()
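# Alignment sketch: if pred_dur were [[2, 3]] for two tokens, pred_aln_trg would be
#   [[1, 1, 0, 0, 0],
#    [0, 0, 1, 1, 1]]
# so every output frame attends to exactly one token for that token's predicted duration.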
# In kokoro.py
@torch.no_grad()
def forward_b(model, tokens, ref_s, speed):
# =========================================================
# START OF BATCH-COMPATIBLE REFACTOR
# =========================================================
# --- 1. Device and Input Shape Management ---
device = ref_s.device
# Check if the input is a batch or a single example
if tokens.dim() == 1:
# It's a single item, wrap it in a batch of 1
tokens = tokens.unsqueeze(0)
batch_size = tokens.shape[0]
    tokens = tokens.to(device)
# --- 2. Tokenization and Lengths (BATCHED) ---
# Create start/end padding for the entire batch
zeros = torch.zeros((batch_size, 1), dtype=torch.long, device=device)
tokens = torch.cat([zeros, tokens, zeros], dim=1)
# Calculate lengths for each item in the batch
input_lengths = torch.LongTensor([tokens.shape[-1]] * batch_size).to(device)
# --- 3. Text Mask and Predictor (Mostly unchanged, already batch-compatible) ---
text_mask = length_to_mask(input_lengths).to(device)
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
# s is the reference signal, it should already be batched
# If ref_s was (D) it became (B,D). If it was (1,D) it became (B,D).
# This is handled by the `expand` in `generate_batched`.
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
# --- 4. Fusion and Duration Prediction (BATCHED) ---
x, _ = model.predictor.lstm(d)
duration = model.predictor.duration_proj(x)
# Sum over the last dimension, not all axes. Keep the batch dimension.
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long() # Shape: (batch_size, num_tokens)
# --- 5. Alignment Matrix Construction (CRITICAL BATCHED REFACTOR) ---
# This replaces the slow, single-item for-loop.
# Get the total length of the generated audio for each item in the batch
pred_len = pred_dur.sum(axis=1) # Shape: (batch_size)
max_pred_len = pred_len.max().item()
# Create the alignment matrix for the whole batch
pred_aln_trg = torch.zeros((batch_size, tokens.shape[1], max_pred_len), dtype=torch.float, device=device)
# Create a range tensor for positioning frames
# This creates a "row index" for each frame in the output
cumsum_dur = torch.cumsum(pred_dur, dim=1)
# The starting frame for each token's duration
starts = cumsum_dur - pred_dur
    # Fill the alignment matrix token by token (plain nested loop; a loop-free sketch follows below)
for b in range(batch_size):
for i in range(tokens.shape[1]):
start, dur = starts[b, i], pred_dur[b, i]
if dur > 0:
pred_aln_trg[b, i, start:start + dur] = 1
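    # A loop-free alternative sketch (an untested assumption, but it should be equivalent
    # to the nested loop above): mark frame f for token i when starts[b, i] <= f < cumsum_dur[b, i].
    #   frame_idx = torch.arange(max_pred_len, device=device)                  # (T_out,)
    #   pred_aln_trg = ((frame_idx >= starts.unsqueeze(-1)) &
    #                   (frame_idx < cumsum_dur.unsqueeze(-1))).float()        # (B, T_tok, T_out)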
# --- 6. Decoder (Mostly unchanged) ---
en = d.transpose(-1, -2) @ pred_aln_trg
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg
# The decoder should handle batched inputs.
# The output will be (batch_size, audio_length)
output_batch = model.decoder(asr, F0_pred, N_pred, ref_s[:, :128])
# Note: We can't .squeeze() and .cpu() here, because the `generate_batched`
# function expects a batch of tensors. This will be handled in the main script.
return output_batch
# =========================================================
# END OF BATCH-COMPATIBLE REFACTOR
# =========================================================