Update app.py
app.py CHANGED
@@ -37,31 +37,34 @@ def transcribe(audio):
 
     return processor.decode(ids, skip_special_tokens=True)
 
-def transcribe_long(audio, chunk_seconds=30):
+def transcribe_long(audio, chunk_seconds=30, overlap_seconds=5):
     if audio is None:
         return "Aucun audio fourni."
 
-    # 1. Load audio
     speech, sr = librosa.load(audio, sr=16000)
-
-    # 2. Split into 30-second chunks
     chunk_size = chunk_seconds * sr
-
-
+    overlap = overlap_seconds * sr
+    start = 0
     full_text = ""
-
+
+    while start < len(speech):
+        end = min(start + chunk_size, len(speech))
+        chunk = speech[start:end]
+
         inputs = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
 
         with torch.no_grad():
-            ids = model.generate(inputs, max_length=
+            ids = model.generate(inputs, max_length=448)[0]
 
         text = processor.decode(ids, skip_special_tokens=True)
-        full_text +=
+        full_text += text + " "
 
-
+        start += chunk_size - overlap
+        print(f"Chunk {start//(chunk_size - overlap)} / {len(speech)//(chunk_size - overlap)}")
 
     return full_text.strip()
 
+
 def transcribe_music(audio, segment_length=15): # 15 sec is ideal
     y, sr = librosa.load(audio, sr=16000)
 
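For reference, the overlapping-chunk schedule that the new transcribe_long walks through can be sketched on its own, without the model. The sampling rate and the chunk/overlap sizes below mirror the defaults in the diff (16 kHz, 30 s chunks, 5 s overlap); the 70-second clip length is only an illustrative assumption, not a value from app.py.

# Standalone sketch of the chunk schedule used by transcribe_long above.
# chunk_seconds=30 and overlap_seconds=5 mirror the diff's defaults;
# the 70 s clip duration is an assumed example value.
sr = 16000                 # 16 kHz, matching librosa.load(audio, sr=16000) above
chunk_size = 30 * sr       # samples per chunk
overlap = 5 * sr           # samples shared between consecutive chunks
total = 70 * sr            # assumed clip length for illustration

start = 0
while start < total:
    end = min(start + chunk_size, total)
    print(f"chunk {start // sr:>3}s -> {end // sr:>3}s")
    start += chunk_size - overlap
# prints: 0s -> 30s, 25s -> 55s, 50s -> 70s

Because each step advances by chunk_size - overlap, every chunk re-reads the last 5 seconds of the previous one, so a word cut at a chunk boundary is still heard in full by the next chunk; the simple full_text += text + " " concatenation accepts that a few words near each seam may be transcribed twice.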