herimor committed on
Commit
413c2da
·
1 Parent(s): ad1cf9e

Add download button

Browse files
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
 
3
  # Disable PyTorch dynamo/inductor globally
4
  os.environ["TORCHDYNAMO_DISABLE"] = "1"
@@ -14,6 +15,7 @@ import torch
14
  import spaces
15
  import gradio as gr
16
  import numpy as np
 
17
 
18
  from voxtream.generator import SpeechGenerator, SpeechGeneratorConfig
19
 
@@ -76,6 +78,11 @@ def float32_to_int16(audio_float32: np.ndarray) -> np.ndarray:
76
  return audio_int16
77
 
78
 
 
 
 
 
 
79
  @spaces.GPU
80
  def synthesize_fn(prompt_audio_path, prompt_text, target_text):
81
  if next(speech_generator.model.parameters()).device.type == "cpu":
@@ -87,7 +94,8 @@ def synthesize_fn(prompt_audio_path, prompt_text, target_text):
87
  speech_generator.device = "cuda"
88
 
89
  if not prompt_audio_path or not target_text:
90
- return None
 
91
  stream = speech_generator.generate_stream(
92
  prompt_text=prompt_text,
93
  prompt_audio_path=Path(prompt_audio_path),
@@ -96,14 +104,16 @@ def synthesize_fn(prompt_audio_path, prompt_text, target_text):
96
 
97
  buffer = []
98
  buffer_len = 0
 
99
 
100
  for frame, _ in stream:
101
  buffer.append(frame)
 
102
  buffer_len += frame.shape[0]
103
 
104
  if buffer_len >= CHUNK_SIZE:
105
  audio = np.concatenate(buffer)
106
- yield (config.mimi_sr, float32_to_int16(audio))
107
 
108
  # Reset buffer and length
109
  buffer = []
@@ -116,7 +126,22 @@ def synthesize_fn(prompt_audio_path, prompt_text, target_text):
116
  if nfade > 0:
117
  fade = np.linspace(1.0, 0.0, nfade, dtype=np.float32)
118
  final[-nfade:] *= fade
119
- yield (config.mimi_sr, float32_to_int16(final))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
 
122
  def main():
@@ -150,6 +175,14 @@ def main():
150
  interactive=False,
151
  streaming=True,
152
  autoplay=True,
 
 
 
 
 
 
 
 
153
  )
154
 
155
  with gr.Row():
@@ -177,27 +210,33 @@ def main():
177
  outputs=[validation_msg, submit_btn],
178
  )
179
 
180
- # --- Wire up actions ---
181
  submit_btn.click(
182
- fn=lambda a, p, t: None, # clears the audio value
183
  inputs=[prompt_audio, prompt_text, target_text],
184
- outputs=output_audio,
185
  show_progress="hidden",
186
  ).then(
187
  fn=synthesize_fn,
188
  inputs=[prompt_audio, prompt_text, target_text],
189
- outputs=output_audio,
190
  )
191
 
192
  clear_btn.click(
193
- fn=lambda: (None, "", "", None, gr.update(visible=False, value=""), gr.update(interactive=False)),
 
 
 
 
 
 
194
  inputs=[],
195
- outputs=[prompt_audio, prompt_text, target_text, output_audio, validation_msg, submit_btn],
196
  )
197
 
198
  # --- Add Examples ---
199
  gr.Markdown("### Examples")
200
- gr.Examples(
201
  examples=[
202
  [
203
  "assets/app/male.wav",
@@ -211,9 +250,16 @@ def main():
211
  ],
212
  ],
213
  inputs=[prompt_audio, prompt_text, target_text],
214
- outputs=output_audio,
215
  fn=synthesize_fn,
216
- cache_examples=True,
 
 
 
 
 
 
 
217
  )
218
 
219
  demo.launch()
 
1
  import os
2
+ import uuid
3
 
4
  # Disable PyTorch dynamo/inductor globally
5
  os.environ["TORCHDYNAMO_DISABLE"] = "1"
 
15
  import spaces
16
  import gradio as gr
17
  import numpy as np
18
+ import soundfile as sf
19
 
20
  from voxtream.generator import SpeechGenerator, SpeechGeneratorConfig
21
 
 
78
  return audio_int16
79
 
80
 
81
def _clear_outputs():
    """Reset the playback/download widgets before a new synthesis run.

    Returns a 2-tuple for the (output_audio, download_btn) components:
    ``None`` clears the streaming audio player, and the ``gr.update``
    empties and hides the download button. The button is re-shown with a
    file path by ``synthesize_fn`` once the full audio has been written.
    """
    # NOTE(review): wired as the handler for ex.dataset.click in main();
    # download_btn is updated directly as an output — there is no .change
    # listener mirroring a file component, despite what the old comment said.
    return None, gr.update(value=None, visible=False)
84
+
85
+
86
  @spaces.GPU
87
  def synthesize_fn(prompt_audio_path, prompt_text, target_text):
88
  if next(speech_generator.model.parameters()).device.type == "cpu":
 
94
  speech_generator.device = "cuda"
95
 
96
  if not prompt_audio_path or not target_text:
97
+ return None, gr.update(value=None, visible=False)
98
+
99
  stream = speech_generator.generate_stream(
100
  prompt_text=prompt_text,
101
  prompt_audio_path=Path(prompt_audio_path),
 
104
 
105
  buffer = []
106
  buffer_len = 0
107
+ total_buffer = []
108
 
109
  for frame, _ in stream:
110
  buffer.append(frame)
111
+ total_buffer.append(frame)
112
  buffer_len += frame.shape[0]
113
 
114
  if buffer_len >= CHUNK_SIZE:
115
  audio = np.concatenate(buffer)
116
+ yield (config.mimi_sr, float32_to_int16(audio)), None
117
 
118
  # Reset buffer and length
119
  buffer = []
 
126
  if nfade > 0:
127
  fade = np.linspace(1.0, 0.0, nfade, dtype=np.float32)
128
  final[-nfade:] *= fade
129
+ yield (config.mimi_sr, float32_to_int16(final)), None
130
+
131
+ # Save the full audio to a file for download
132
+ if len(total_buffer) > 0:
133
+ full_audio = np.concatenate(total_buffer)
134
+ nfade = min(int(config.mimi_sr * FADE_OUT_SEC), full_audio.shape[0])
135
+ if nfade > 0:
136
+ fade = np.linspace(1.0, 0.0, nfade, dtype=np.float32)
137
+ full_audio[-nfade:] *= fade
138
+
139
+ file_path = f"/tmp/voxtream_{uuid.uuid4().hex}.wav"
140
+ sf.write(file_path, float32_to_int16(full_audio), config.mimi_sr)
141
+
142
+ yield None, gr.update(value=file_path, visible=True)
143
+ else:
144
+ yield None, gr.update(value=None, visible=False)
145
 
146
 
147
  def main():
 
175
  interactive=False,
176
  streaming=True,
177
  autoplay=True,
178
+ show_download_button=False,
179
+ show_share_button=False,
180
+ )
181
+
182
+ # appears only when file is ready
183
+ download_btn = gr.DownloadButton(
184
+ "Download audio",
185
+ visible=False,
186
  )
187
 
188
  with gr.Row():
 
210
  outputs=[validation_msg, submit_btn],
211
  )
212
 
213
+ # clear outputs before streaming
214
  submit_btn.click(
215
+ fn=lambda a, p, t: (None, gr.update(value=None, visible=False)),
216
  inputs=[prompt_audio, prompt_text, target_text],
217
+ outputs=[output_audio, download_btn],
218
  show_progress="hidden",
219
  ).then(
220
  fn=synthesize_fn,
221
  inputs=[prompt_audio, prompt_text, target_text],
222
+ outputs=[output_audio, download_btn],
223
  )
224
 
225
  clear_btn.click(
226
+ fn=lambda: (
227
+ None, "", "", # inputs
228
+ None, # output_audio
229
+ gr.update(value=None, visible=False), # download_btn
230
+ gr.update(visible=False, value=""), # validation_msg
231
+ gr.update(interactive=False), # submit_btn
232
+ ),
233
  inputs=[],
234
+ outputs=[prompt_audio, prompt_text, target_text, output_audio, download_btn, validation_msg, submit_btn],
235
  )
236
 
237
  # --- Add Examples ---
238
  gr.Markdown("### Examples")
239
+ ex = gr.Examples(
240
  examples=[
241
  [
242
  "assets/app/male.wav",
 
250
  ],
251
  ],
252
  inputs=[prompt_audio, prompt_text, target_text],
253
+ outputs=[output_audio, download_btn],
254
  fn=synthesize_fn,
255
+ cache_examples=False,
256
+ )
257
+
258
+ ex.dataset.click(
259
+ fn=_clear_outputs,
260
+ inputs=[],
261
+ outputs=[output_audio, download_btn],
262
+ queue=False,
263
  )
264
 
265
  demo.launch()
gradio_cached_examples/16/Synthesized audio/95f83d950a0400b268bd/tmppmcwrg5n DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac85b968e44a98af1e2f344ed56f68c700cd2b99a3c114d2552c66b2b6c2e957
3
- size 326444
 
 
 
 
gradio_cached_examples/16/Synthesized audio/b5933b8060d980ce1ea1/tmp339_glws DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a15baf860116573dd4985238c7a05fe3120f3732b43bef7d8c8aa22e07b5fbd
3
- size 322604
 
 
 
 
gradio_cached_examples/16/log.csv DELETED
@@ -1,3 +0,0 @@
1
- Synthesized audio,flag,username,timestamp
2
- "{""path"": ""gradio_cached_examples/16/Synthesized audio/95f83d950a0400b268bd/tmppmcwrg5n"", ""url"": null, ""size"": null, ""orig_name"": null, ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2025-09-28 16:43:00.957637
3
- "{""path"": ""gradio_cached_examples/16/Synthesized audio/b5933b8060d980ce1ea1/tmp339_glws"", ""url"": null, ""size"": null, ""orig_name"": null, ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2025-09-28 16:43:06.729484