import os

import gradio as gr
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

#MODEL_NAME = "Ronaldodev/whisper-fon-v3-clean"
MODEL_NAME = "Ronaldodev/whisper-fon-v4"
TOKEN = os.getenv("HF_TOKEN")  # add your token under Secrets → HF_TOKEN

device = "cuda" if torch.cuda.is_available() else "cpu"

# The fine-tuned checkpoint is loaded as the model; the processor comes from
# the base openai/whisper-small checkpoint.
#processor = WhisperProcessor.from_pretrained(MODEL_NAME, token=TOKEN)
#model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, token=TOKEN).to(device)
processor = WhisperProcessor.from_pretrained("openai/whisper-small", token=TOKEN)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, token=TOKEN).to(device)


def transcribe(audio):
    """Transcribe a short audio file in a single pass."""
    if audio is None:
        return "No audio file received."
    speech, sr = librosa.load(audio, sr=16000)
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt").input_features.to(device)
    with torch.no_grad():
        ids = model.generate(
            inputs,
            max_length=300,
            # language="fon" omitted: Fon is not a built-in Whisper language token
            task="transcribe",  # fine for ASR
        )[0]
    return processor.decode(ids, skip_special_tokens=True)


def transcribe_long(audio, chunk_seconds=30, overlap_seconds=5):
    """Transcribe long audio by sliding a window with a small overlap."""
    if audio is None:
        return "No audio provided."
    speech, sr = librosa.load(audio, sr=16000)
    chunk_size = chunk_seconds * sr
    overlap = overlap_seconds * sr
    step = chunk_size - overlap
    start = 0
    full_text = ""
    while start < len(speech):
        end = min(start + chunk_size, len(speech))
        chunk = speech[start:end]
        inputs = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        with torch.no_grad():
            ids = model.generate(inputs, max_length=448)[0]
        text = processor.decode(ids, skip_special_tokens=True)
        full_text += text + " "
        start += step
        print(f"Chunk {start // step} / {len(speech) // step}")
    return full_text.strip()


def transcribe_music(audio, segment_length=15):  # 15 s per segment works well
    """Transcribe sung audio in short, non-overlapping segments."""
    y, sr = librosa.load(audio, sr=16000)
    step = segment_length * sr
    full_text = ""
    for start in range(0, len(y), step):
        chunk = y[start:start + step]  # slicing past the end is safe
        inp = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        with torch.no_grad():
            ids = model.generate(inp, max_length=300)[0]
        txt = processor.decode(ids, skip_special_tokens=True)
        print(f"Segment {start / sr:.1f}s → {txt[:40]}...")
        full_text += txt + " "
    return full_text.strip()


demo = gr.Interface(fn=transcribe_long, inputs=gr.Audio(type="filepath"), outputs="text")
demo.launch(share=True)
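
# Optional: a minimal sketch of the same long-form transcription using the
# transformers ASR pipeline, which handles chunking and overlap (striding)
# internally. This assumes the MODEL_NAME checkpoint loads cleanly through the
# pipeline API; uncomment to try it in place of the manual transcribe_long loop.
#
# from transformers import pipeline
#
# asr = pipeline(
#     "automatic-speech-recognition",
#     model=MODEL_NAME,
#     token=TOKEN,
#     device=0 if torch.cuda.is_available() else -1,
#     chunk_length_s=30,   # same window as transcribe_long
#     stride_length_s=5,   # same overlap as transcribe_long
# )
#
# def transcribe_pipeline(audio):
#     return asr(audio)["text"]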