kevinwang676
/

Cosyvoice2-dev

ONNX

Safetensors

Model card Files Files and versions

xet

Community

kevinwang676 committed on May 15

Commit

0211c0b

verified ·

1 Parent(s): 3043352

Update runtime/python/grpc/server.py

Browse files

Files changed (1) hide show

runtime/python/grpc/server.py +83 -57

runtime/python/grpc/server.py CHANGED Viewed

@@ -59,35 +59,66 @@ def _yield_audio(model_output):
         resp = cosyvoice_pb2.Response(tts_audio=pcm16.tobytes())
         yield resp
-import urllib.parse
 def _load_prompt_from_url(url: str, target_sr: int = 16_000) -> torch.Tensor:
-    """
-    Download *url* (wav / mp3 / flac) ➜ mono torch.FloatTensor [1, T] @ target_sr.
-    The temp file is removed before return.
-    """
     resp = requests.get(url, timeout=10)
-    resp.raise_for_status()
-    # keep the original extension so torchaudio picks the right decoder
-    #ext = os.path.splitext(urllib.parse.urlparse(url).path)[1] or ".tmp"
-    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
         f.write(resp.content)
-        tmp_path = f.name
     try:
-        wav, sr = torchaudio.load(tmp_path)        # handles wav / mp3 / flac
-        if wav.ndim > 1:
-            wav = wav.mean(dim=0, keepdim=True)
-        if sr != target_sr:
-            wav = torchaudio.functional.resample(wav, sr, target_sr)
-        return wav                                  # [1, T] float32 in [-1,1]
     finally:
-        try:
-            os.remove(tmp_path)
-        except Exception as e:
-            logging.warning("Could not delete temp file %s: %s", tmp_path, e)
 # ────────────────────────────────────────────────────────────────────────────────
 # gRPC service
 # ────────────────────────────────────────────────────────────────────────────────
@@ -182,55 +213,50 @@ class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer):
             return
-        # 4. Instruction‑TTS (two flavours)
         if request.HasField("instruct_request"):
             ir = request.instruct_request
-            # ──────────────────────────────────────────────────────────────────
-            # 4‑a) instruct‑2  (has prompt_audio  →  bytes OR S3 URL)
-            # ──────────────────────────────────────────────────────────────────
-            if ir.HasField("prompt_audio"):
-                logging.info("Received instruct‑2 inference request")
-                tmp_path = None
-                try:
-                    if ir.prompt_audio.startswith(b'http'):
-                        prompt = _load_prompt_from_url(ir.prompt_audio.decode('utf‑8'))
-                    else:
-                        # legacy raw‑bytes payload
-                        prompt = _bytes_to_tensor(ir.prompt_audio)
-                    speed = getattr(ir, "speed", 1.0)
-                    mo = self.cosyvoice.inference_instruct2(
-                        ir.tts_text,
-                        ir.instruct_text,
-                        prompt,
-                        stream=False,
-                        speed=speed
-                    )
-                finally:
-                    if tmp_path and os.path.exists(tmp_path):
-                        try:
-                            os.remove(tmp_path)
-                        except Exception as e:
-                            logging.warning("Could not remove temp file %s: %s",
-                                            tmp_path, e)
-            # ──────────────────────────────────────────────────────────────────
-            # 4‑b) classic instruct (speaker‑ID, no prompt audio)
-            # ──────────────────────────────────────────────────────────────────
             else:
-                logging.info("Received instruct inference request")
-                mo = self.cosyvoice.inference_instruct(
-                    ir.tts_text,
-                    ir.spk_id,
-                    ir.instruct_text
-                )
             yield from _yield_audio(mo)
             return
         # unknown request type
         context.abort(grpc.StatusCode.INVALID_ARGUMENT,
                       "Unsupported request type in oneof field.")

         resp = cosyvoice_pb2.Response(tts_audio=pcm16.tobytes())
         yield resp
+import os, io, tempfile, requests, torch, torchaudio
+from urllib.parse import urlparse
 def _load_prompt_from_url(url: str, target_sr: int = 16_000) -> torch.Tensor:
+    """Download an audio file from ``url`` (wav / mp3 / flac / ogg …),
+    convert it to mono, resample to ``target_sr`` if necessary,
+    and return a 1×T float‑tensor in the range ‑1…1."""
+    # ─── 1.  Download ────────────────────────────────────────────────────────────
     resp = requests.get(url, timeout=10)
+    if resp.status_code != 200:
+        raise HTTPException(status_code=400,
+                            detail=f"Failed to download audio from URL: {url}")
+    # Infer extension from URL *or* Content‑Type header
+    ext = os.path.splitext(urlparse(url).path)[1].lower()
+    if not ext and 'content-type' in resp.headers:
+        mime = resp.headers['content-type'].split(';')[0].strip()
+        ext = {
+            'audio/mpeg': '.mp3',
+            'audio/wav':  '.wav',
+            'audio/x-wav': '.wav',
+            'audio/flac': '.flac',
+            'audio/ogg':  '.ogg',
+            'audio/x-m4a': '.m4a',
+        }.get(mime, '.audio')            # generic fallback
+    with tempfile.NamedTemporaryFile(suffix=ext or '.audio', delete=False) as f:
         f.write(resp.content)
+        temp_path = f.name
+    # ─── 2.  Decode (torchaudio first, pydub fallback) ──────────────────────────
     try:
+        # Let torchaudio pick the right backend automatically
+        speech, sample_rate = torchaudio.load(temp_path)
+    except Exception:
+        # Fallback that works as long as ffmpeg is present
+        from pydub import AudioSegment
+        import numpy as np
+        seg = AudioSegment.from_file(temp_path)       # any ffmpeg‑supported format
+        seg = seg.set_channels(1)                     # force mono
+        sample_rate = seg.frame_rate
+        np_audio = np.array(seg.get_array_of_samples()).astype(np.float32)
+        # normalise to −1…1 based on sample width
+        np_audio /= float(1 << (8 * seg.sample_width - 1))
+        speech = torch.from_numpy(np_audio).unsqueeze(0)
     finally:
+        os.unlink(temp_path)
+    # ─── 3.  Ensure mono + correct sample‑rate ──────────────────────────────────
+    if speech.dim() > 1 and speech.size(0) > 1:
+        speech = speech.mean(dim=0, keepdim=True)     # average to mono
+    if sample_rate != target_sr:
+        speech = torchaudio.transforms.Resample(orig_freq=sample_rate,
+                                                new_freq=target_sr)(speech)
+    return speech
 # ────────────────────────────────────────────────────────────────────────────────
 # gRPC service
 # ────────────────────────────────────────────────────────────────────────────────
             return
+        # 4. Instruct‑2  (CosyVoice2 supports this variant only)
         if request.HasField("instruct_request"):
             ir = request.instruct_request
+            # ---- require that the descriptor contains the field -------------------
+            if 'prompt_audio' not in ir.DESCRIPTOR.fields_by_name:
+                context.abort(
+                    grpc.StatusCode.INVALID_ARGUMENT,
+                    "Server expects instruct‑2 proto with a 'prompt_audio' field."
+                )
+            # ---- make sure it is non‑empty (no HasField for proto3 scalars) -------
+            if len(ir.prompt_audio) == 0:
+                context.abort(
+                    grpc.StatusCode.INVALID_ARGUMENT,
+                    "'prompt_audio' must not be empty for instruct‑2 requests."
+                )
+            logging.info("Received instruct‑2 inference request")
+            # convert to bytes no matter what scalar type the proto uses
+            pa_bytes = (ir.prompt_audio.encode('utf-8') if isinstance(ir.prompt_audio, str)
+                        else ir.prompt_audio)
+            # URL vs raw bytes
+            if pa_bytes.startswith(b"http"):
+                prompt = _load_prompt_from_url(pa_bytes.decode('utf-8'))
             else:
+                prompt = _bytes_to_tensor(pa_bytes)
+            speed = getattr(ir, "speed", 1.0)
+            mo = self.cosyvoice.inference_instruct2(
+                ir.tts_text,
+                ir.instruct_text,
+                prompt,
+                stream=False,
+                speed=speed,
+            )
             yield from _yield_audio(mo)
             return
         # unknown request type
         context.abort(grpc.StatusCode.INVALID_ARGUMENT,
                       "Unsupported request type in oneof field.")