Fix output return type; install missing libs; update DEFAULT_REF_TEXT; improve error handling in infer function
- app.py +13 -7
- requirements.txt +2 -1
app.py
CHANGED
@@ -15,7 +15,7 @@ logging.basicConfig(
 DEFAULT_REF_PATH = "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
 DEFAULT_GEN_TEXT = "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible."
 SAMPLES_PATH = os.path.join(os.getcwd(), "samples")
-DEFAULT_REF_TEXT = ""
+DEFAULT_REF_TEXT = "That place in the distance, it's huge and dedicated to Lady Shah. It can only mean one thing. I have a hidden place close to the cloister where night orchids bloom."
 
 model = Dia.from_pretrained("nari-labs/Dia-1.6B-0626")
 
@@ -35,7 +35,7 @@ def transcribe(file_path: str):
 
 def infer(
     gen_text: str,
-    ref_text: str =
+    ref_text: str = DEFAULT_REF_TEXT,
     ref_audio_path: str = DEFAULT_REF_PATH,
 ) -> tuple[int, np.ndarray]:
     """
@@ -45,12 +45,12 @@ def infer(
         ref_text (str): The text corresponding to the reference audio.
         ref_audio_path (str): The file path to the reference audio.
     Returns:
-        tuple [int, np.ndarray]: A tuple containing the sample rate (
+        tuple [int, np.ndarray]: A tuple containing the sample rate (44100) and the generated audio waveform as a numpy array.
     """
 
     if gen_text is None or not len(gen_text):
-        raise
-
+        raise ValueError("Please insert the new text to synthesize.")
+    if ref_audio_path != DEFAULT_REF_PATH and ref_text == DEFAULT_REF_TEXT: ref_text = ""
     if not len(ref_text):
         ref_text = transcribe(ref_audio_path)
 
@@ -58,6 +58,7 @@ def infer(
     gr.Info("Starting inference request!")
     gr.Info("Encoding reference...")
 
+    # ndarray[Unknown, Unknown] | list[ndarray[Unknown, Unknown]]
     output = model.generate(
         ref_text + gen_text,
         audio_prompt=ref_audio_path,
@@ -69,6 +70,11 @@ def infer(
         cfg_filter_top_k=50,
     )
 
+    if isinstance(output, list):
+        output = np.concatenate(output, axis=-1)  # join the generated audio chunks
+    elif not isinstance(output, np.ndarray):
+        output = np.array(output, dtype=np.float32)
+
     return (44100, output)
 
 
@@ -76,7 +82,7 @@ demo = gr.Interface(
     fn=infer,
     inputs=[
         gr.Textbox(label="Text to Generate", value=DEFAULT_GEN_TEXT),
-        gr.Textbox(label="Reference Text (Optional)"),
+        gr.Textbox(label="Reference Text (Optional)", value=DEFAULT_REF_TEXT),
         gr.Audio(type="filepath", label="Reference Audio", value=DEFAULT_REF_PATH),
     ],
     outputs=gr.Audio(type="numpy", label="Generated Speech"),
@@ -85,4 +91,4 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
-    demo.queue(max_size=10).launch(allowed_paths=[SAMPLES_PATH], mcp_server=
+    demo.queue(max_size=10).launch(allowed_paths=[SAMPLES_PATH], mcp_server=False, inbrowser=True)
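For reference, the output-normalization fix can be read in isolation. The sketch below assumes, as the new code does, that Dia's generate() may return either a single waveform array or a list of chunk arrays; the helper name to_gradio_audio and the non-list fallback comment are illustrative additions, not part of the commit.

import numpy as np

def to_gradio_audio(output, sample_rate: int = 44100) -> tuple[int, np.ndarray]:
    # Normalize the model output into the (sample_rate, waveform) tuple
    # that gr.Audio(type="numpy") expects.
    if isinstance(output, list):
        # A list of chunks: concatenate along the last (time) axis.
        output = np.concatenate(output, axis=-1)
    elif not isinstance(output, np.ndarray):
        # Anything else array-like, e.g. a plain Python list of floats.
        output = np.array(output, dtype=np.float32)
    return (sample_rate, output)

# Usage (hypothetical): sr, wav = to_gradio_audio(model.generate(ref_text + gen_text, ...))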
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ soundfile>=0.13.1
 torchaudio>=2.0.0
 torch>=2.0.0
 gradio-dialogue>=0.0.4
-groq
+groq
+torchcodec