Description
Website: https://moira-ai.com/
Email: moira.ai2024@gmail.com
Report: https://moiraai2024.github.io/GreekTTS-1.5-demo/
Welcome to Moira.AI GreekTTS-1.5, a state-of-the-art Greek text-to-speech system designed to deliver exceptional naturalness and intelligibility in speech synthesis. Building on our previous work in Greek TTS, GreekTTS-1.5 marks a significant leap forward in quality, accessibility, and performance.
GreekTTS-1.5 is built on the powerful Orpheus foundation model and fine-tuned using Low-Rank Adaptation (LoRA), a parameter-efficient method that enables effective adaptation to a custom, high-quality Greek speech corpus. In our evaluations (see the report linked above), the resulting model outperforms existing Greek TTS baselines, offering fluid prosody, accurate pronunciation, and expressive speech generation.
Whether you're developing virtual assistants, audiobooks, accessibility tools, or any other application that requires natural-sounding Greek speech, GreekTTS-1.5 provides a high-fidelity solution ready for integration.
Key Features:
- Built on the robust Orpheus foundation model for high-quality performance.
- Fine-tuned using LoRA for efficient adaptation to Greek speech data.
- Produces natural, expressive, and intelligible Greek speech.
- Designed specifically for Greek — a low-resource language in TTS.
- Ideal for integration into applications requiring human-like speech synthesis.
- Open-source and extensible for future research and development.
Explore GreekTTS-1.5 and take your Greek TTS applications to the next level.
How to use it
First, set up the environment following the Unsloth conda installation guide (https://docs.unsloth.ai/get-started/install-and-update/conda-install):

conda create --name unsloth_env \
    python=3.11 \
    pytorch-cuda=12.1 \
    pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \
    -y
conda activate unsloth_env

pip install unsloth
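The inference code below also imports the SNAC codec, SciPy, and IPython; if they are not already present in the environment, they can typically be installed with pip (package names assumed from the respective projects):

pip install snac scipy ipython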
import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
from snac import SNAC                       # SNAC neural audio codec (decoder)
from IPython.display import display, Audio  # inline audio playback in notebooks
import numpy as np
import scipy.io.wavfile                     # optional: for saving WAVs to disk (see sketch at the end)
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
# --- Define Constants and Configuration ---
print("\n⏳ Defining constants...")
# Model paths
BASE_MODEL_NAME = "unsloth/orpheus-3b-0.1-ft"
# 🔴 CRITICAL: UPDATE THIS PATH 🔴
# This must be the path to the LoRA adapters you saved during training.
# This should be inside the `output_dir` you set, e.g., "orpheus_training_dir/checkpoint-6000"
LORA_ADAPTERS_PATH = "training_dir_latest/checkpoint-1688"
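# Hypothetical helper: list the checkpoints actually saved, in case your
# checkpoint number differs from the example above.
# import glob
# print(sorted(glob.glob("training_dir_latest/checkpoint-*")))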
# --- Load Your Fine-Tuned Orpheus Model ---
print("\n⏳ Loading models...")
# Load the base Orpheus model
model, _ = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_NAME,
    max_seq_length=2048,
    dtype=None,  # None = auto-detect
    load_in_4bit=False,
)
# Load your fine-tuned LoRA adapters on top
model.load_adapter(LORA_ADAPTERS_PATH)
print("✅ Loaded fine-tuned LoRA adapters.")
# Load the Orpheus tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
# Load the SNAC model (the "Vocal Cords")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
# Special token IDs (Orpheus extends the Llama-3 text vocabulary, which has 128256 entries)
tokeniser_length = 128256
start_of_speech = tokeniser_length + 1  # 128257
end_of_speech = tokeniser_length + 2    # 128258
start_of_human = tokeniser_length + 3   # 128259
end_of_human = tokeniser_length + 4     # 128260
start_of_ai = tokeniser_length + 5      # 128261
end_of_ai = tokeniser_length + 6        # 128262
pad_token = 128263
start_of_text = 128000  # the Llama tokenizer's BOS token
end_of_text = 128009
print("✅ Constants defined.")
golden_test_set_prompts = [
"Στις 15 Μαΐου του 2024, το προϊόν κόστιζε 19,50€.",
"Έκανα post στο Instagram και μετά πήγα για shopping στο mall.",
"Ο λογαριασμός της Δ.Ε.Η. πρέπει να πληρωθεί, π.χ. μέσω τραπέζης.", # D.E.H. and p.x.
"Η εκστρατεία προσέλκυσε χιλιάδες εθελοντές.",
"Η Μαρία Παπαδοπούλου συνάντησε τον Γιάννη Οικονόμου.",
"Μια πάπια, μα ποια πάπια Μια πάπια με παπιά.",
"Ο παπάς ο παχύς, έφαγε παχιά φακή. Γιατί παπά παχύ, έφαγες παχιά φακή;",
"Άσπρη πέτρα ξέξασπρη κι απ' τον ήλιο ξεξασπρότερη.",
"Ο μπαμπάς πήγε στην αντάρα για να βρει τα αγκάθια.", # Tests μπ, ντ, γκ
"Οι τρεις ιερείς είδαν το υλικό.", # Tests ει, οι, υι (all sound like /i/)
"Έφαγα τζατζίκι και τσάι στην πλατεία.", # Tests τσ, τζ
"Ο νόμος είναι σαφής.", # NOmos (law)
"Ο νομός Αττικής είναι μεγάλος.", # noMOS (prefecture)
"Η παγκοσμιοποίηση επηρεάζει την οικονομία." # Tests stress on long words,
]
# --- Configure the Generation ---
def infer(prompts, chosen_voice):
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
    # Keep the SNAC decoder on CPU so the GPU is free for token generation
    snac_model.to("cpu")

    # Prepend the voice name if one is given (None for single-speaker models)
    prompts_ = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]

    all_input_ids = []
    for prompt in prompts_:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        all_input_ids.append(input_ids)

    start_token = torch.tensor([[start_of_human]], dtype=torch.int64)            # 128259
    end_tokens = torch.tensor([[end_of_text, end_of_human]], dtype=torch.int64)  # 128009, 128260

    # Frame each prompt as: SOH <BOS/SOT> text EOT EOH
    all_modified_input_ids = []
    for input_ids in all_input_ids:
        modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
        all_modified_input_ids.append(modified_input_ids)

    # Left-pad every sequence to the batch maximum and build attention masks
    all_padded_tensors = []
    all_attention_masks = []
    max_length = max(t.shape[1] for t in all_modified_input_ids)
    for modified_input_ids in all_modified_input_ids:
        padding = max_length - modified_input_ids.shape[1]
        padded_tensor = torch.cat(
            [torch.full((1, padding), pad_token, dtype=torch.int64), modified_input_ids], dim=1)
        attention_mask = torch.cat(
            [torch.zeros((1, padding), dtype=torch.int64),
             torch.ones((1, modified_input_ids.shape[1]), dtype=torch.int64)], dim=1)
        all_padded_tensors.append(padded_tensor)
        all_attention_masks.append(attention_mask)

    input_ids = torch.cat(all_padded_tensors, dim=0).to("cuda")
    attention_mask = torch.cat(all_attention_masks, dim=0).to("cuda")

    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=end_of_speech,  # 128258
        use_cache=True,
    )

    # Keep only what follows the last start_of_speech marker in the batch
    token_to_find = start_of_speech   # 128257
    token_to_remove = end_of_speech   # 128258
    token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_occurrence_idx = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
    else:
        cropped_tensor = generated_ids

    # Per row: drop end_of_speech tokens, trim to a multiple of 7 (one SNAC
    # frame = 7 codes), and shift IDs down to raw codec values (audio codes
    # start at ID 128266, i.e. tokeniser_length + 10)
    code_lists = []
    for row in cropped_tensor:
        masked_row = row[row != token_to_remove]
        new_length = (masked_row.size(0) // 7) * 7
        code_lists.append([t.item() - 128266 for t in masked_row[:new_length]])

    def redistribute_codes(code_list):
        # Un-interleave the flat 7-codes-per-frame stream into SNAC's three
        # codebook layers (each codebook holds 4096 entries)
        layer_1, layer_2, layer_3 = [], [], []
        for i in range(len(code_list) // 7):
            layer_1.append(code_list[7 * i])
            layer_2.append(code_list[7 * i + 1] - 4096)
            layer_3.append(code_list[7 * i + 2] - (2 * 4096))
            layer_3.append(code_list[7 * i + 3] - (3 * 4096))
            layer_2.append(code_list[7 * i + 4] - (4 * 4096))
            layer_3.append(code_list[7 * i + 5] - (5 * 4096))
            layer_3.append(code_list[7 * i + 6] - (6 * 4096))
        codes = [torch.tensor(layer_1).unsqueeze(0),
                 torch.tensor(layer_2).unsqueeze(0),
                 torch.tensor(layer_3).unsqueeze(0)]
        # codes = [c.to("cuda") for c in codes]  # only if snac_model is on GPU
        audio_hat = snac_model.decode(codes)
        return audio_hat

    my_samples = []
    for code_list in code_lists:
        my_samples.append(redistribute_codes(code_list))

    if len(prompts) != len(my_samples):
        raise Exception("Number of prompts and samples do not match")
    for i in range(len(my_samples)):
        print(prompts[i])
        samples = my_samples[i]
        display(Audio(samples.detach().squeeze().to("cpu").numpy(), rate=24000))
    # Clean up to save RAM
    del my_samples, samples
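# Tuning notes (assumptions, not benchmarked defaults): temperature and top_p
# trade expressiveness against stability; repetition_penalty discourages stuck
# loops; max_new_tokens caps clip length, with every 7 generated tokens
# decoding to one SNAC frame of 24 kHz audio.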
# --- Run inference ---
for prompt in golden_test_set_prompts:
    prompts = [prompt]
    print(prompts)
    chosen_voice = None  # None for single-speaker
    infer(prompts, chosen_voice)
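To keep the generated audio instead of only playing it inline, the decoded waveform can be written to disk. A minimal sketch using scipy.io.wavfile, assuming infer is modified to return my_samples rather than deleting it, and samples is one decoded waveform tensor:

audio = samples.detach().squeeze().to("cpu").numpy()
# SNAC decodes at 24 kHz; convert float samples to 16-bit PCM for a standard WAV
audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
scipy.io.wavfile.write("greek_tts_sample.wav", 24000, audio_int16)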
Model tree for moiraai2024/GreekTTS-1.5
Base model: meta-llama/Llama-3.2-3B-Instruct (via unsloth/orpheus-3b-0.1-ft)