herimor committed on
Commit
413c2da
·
1 Parent(s): ad1cf9e

Add download button

Browse files
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
 
3
  # Disable PyTorch dynamo/inductor globally
4
  os.environ["TORCHDYNAMO_DISABLE"] = "1"
@@ -14,6 +15,7 @@ import torch
14
  import spaces
15
  import gradio as gr
16
  import numpy as np
 
17
 
18
  from voxtream.generator import SpeechGenerator, SpeechGeneratorConfig
19
 
@@ -76,6 +78,11 @@ def float32_to_int16(audio_float32: np.ndarray) -> np.ndarray:
76
  return audio_int16
77
 
78
 
 
 
 
 
 
79
  @spaces.GPU
80
  def synthesize_fn(prompt_audio_path, prompt_text, target_text):
81
  if next(speech_generator.model.parameters()).device.type == "cpu":
@@ -87,7 +94,8 @@ def synthesize_fn(prompt_audio_path, prompt_text, target_text):
87
  speech_generator.device = "cuda"
88
 
89
  if not prompt_audio_path or not target_text:
90
- return None
 
91
  stream = speech_generator.generate_stream(
92
  prompt_text=prompt_text,
93
  prompt_audio_path=Path(prompt_audio_path),
@@ -96,14 +104,16 @@ def synthesize_fn(prompt_audio_path, prompt_text, target_text):
96
 
97
  buffer = []
98
  buffer_len = 0
 
99
 
100
  for frame, _ in stream:
101
  buffer.append(frame)
 
102
  buffer_len += frame.shape[0]
103
 
104
  if buffer_len >= CHUNK_SIZE:
105
  audio = np.concatenate(buffer)
106
- yield (config.mimi_sr, float32_to_int16(audio))
107
 
108
  # Reset buffer and length
109
  buffer = []
@@ -116,7 +126,22 @@ def synthesize_fn(prompt_audio_path, prompt_text, target_text):
116
  if nfade > 0:
117
  fade = np.linspace(1.0, 0.0, nfade, dtype=np.float32)
118
  final[-nfade:] *= fade
119
- yield (config.mimi_sr, float32_to_int16(final))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
 
122
  def main():
@@ -150,6 +175,14 @@ def main():
150
  interactive=False,
151
  streaming=True,
152
  autoplay=True,
 
 
 
 
 
 
 
 
153
  )
154
 
155
  with gr.Row():
@@ -177,27 +210,33 @@ def main():
177
  outputs=[validation_msg, submit_btn],
178
  )
179
 
180
- # --- Wire up actions ---
181
  submit_btn.click(
182
- fn=lambda a, p, t: None, # clears the audio value
183
  inputs=[prompt_audio, prompt_text, target_text],
184
- outputs=output_audio,
185
  show_progress="hidden",
186
  ).then(
187
  fn=synthesize_fn,
188
  inputs=[prompt_audio, prompt_text, target_text],
189
- outputs=output_audio,
190
  )
191
 
192
  clear_btn.click(
193
- fn=lambda: (None, "", "", None, gr.update(visible=False, value=""), gr.update(interactive=False)),
 
 
 
 
 
 
194
  inputs=[],
195
- outputs=[prompt_audio, prompt_text, target_text, output_audio, validation_msg, submit_btn],
196
  )
197
 
198
  # --- Add Examples ---
199
  gr.Markdown("### Examples")
200
- gr.Examples(
201
  examples=[
202
  [
203
  "assets/app/male.wav",
@@ -211,9 +250,16 @@ def main():
211
  ],
212
  ],
213
  inputs=[prompt_audio, prompt_text, target_text],
214
- outputs=output_audio,
215
  fn=synthesize_fn,
216
- cache_examples=True,
 
 
 
 
 
 
 
217
  )
218
 
219
  demo.launch()
 
1
  import os
2
+ import uuid
3
 
4
  # Disable PyTorch dynamo/inductor globally
5
  os.environ["TORCHDYNAMO_DISABLE"] = "1"
 
15
  import spaces
16
  import gradio as gr
17
  import numpy as np
18
+ import soundfile as sf
19
 
20
  from voxtream.generator import SpeechGenerator, SpeechGeneratorConfig
21
 
 
78
  return audio_int16
79
 
80
 
81
def _clear_outputs():
    """Reset the playback/download widgets before a new synthesis run.

    Returns a 2-tuple for the (output_audio, download_btn) components:
    ``None`` clears the streaming audio player, and the ``gr.update``
    empties and hides the download button. The button is re-shown with a
    file path by ``synthesize_fn`` once the full audio has been written.
    """
    # NOTE(review): wired as the handler for ex.dataset.click in main();
    # download_btn is updated directly as an output — there is no .change
    # listener mirroring a file component, despite what the old comment said.
    return None, gr.update(value=None, visible=False)
84
+
85
+
86
  @spaces.GPU
87
  def synthesize_fn(prompt_audio_path, prompt_text, target_text):
88
  if next(speech_generator.model.parameters()).device.type == "cpu":
 
94
  speech_generator.device = "cuda"
95
 
96
  if not prompt_audio_path or not target_text:
97
+ return None, gr.update(value=None, visible=False)
98
+
99
  stream = speech_generator.generate_stream(
100
  prompt_text=prompt_text,
101
  prompt_audio_path=Path(prompt_audio_path),
 
104
 
105
  buffer = []
106
  buffer_len = 0
107
+ total_buffer = []
108
 
109
  for frame, _ in stream:
110
  buffer.append(frame)
111
+ total_buffer.append(frame)
112
  buffer_len += frame.shape[0]
113
 
114
  if buffer_len >= CHUNK_SIZE:
115
  audio = np.concatenate(buffer)
116
+ yield (config.mimi_sr, float32_to_int16(audio)), None
117
 
118
  # Reset buffer and length
119
  buffer = []
 
126
  if nfade > 0:
127
  fade = np.linspace(1.0, 0.0, nfade, dtype=np.float32)
128
  final[-nfade:] *= fade
129
+ yield (config.mimi_sr, float32_to_int16(final)), None
130
+
131
+ # Save the full audio to a file for download
132
+ if len(total_buffer) > 0:
133
+ full_audio = np.concatenate(total_buffer)
134
+ nfade = min(int(config.mimi_sr * FADE_OUT_SEC), full_audio.shape[0])
135
+ if nfade > 0:
136
+ fade = np.linspace(1.0, 0.0, nfade, dtype=np.float32)
137
+ full_audio[-nfade:] *= fade
138
+
139
+ file_path = f"/tmp/voxtream_{uuid.uuid4().hex}.wav"
140
+ sf.write(file_path, float32_to_int16(full_audio), config.mimi_sr)
141
+
142
+ yield None, gr.update(value=file_path, visible=True)
143
+ else:
144
+ yield None, gr.update(value=None, visible=False)
145
 
146
 
147
  def main():
 
175
  interactive=False,
176
  streaming=True,
177
  autoplay=True,
178
+ show_download_button=False,
179
+ show_share_button=False,
180
+ )
181
+
182
+ # appears only when file is ready
183
+ download_btn = gr.DownloadButton(
184
+ "Download audio",
185
+ visible=False,
186
  )
187
 
188
  with gr.Row():
 
210
  outputs=[validation_msg, submit_btn],
211
  )
212
 
213
+ # clear outputs before streaming
214
  submit_btn.click(
215
+ fn=lambda a, p, t: (None, gr.update(value=None, visible=False)),
216
  inputs=[prompt_audio, prompt_text, target_text],
217
+ outputs=[output_audio, download_btn],
218
  show_progress="hidden",
219
  ).then(
220
  fn=synthesize_fn,
221
  inputs=[prompt_audio, prompt_text, target_text],
222
+ outputs=[output_audio, download_btn],
223
  )
224
 
225
  clear_btn.click(
226
+ fn=lambda: (
227
+ None, "", "", # inputs
228
+ None, # output_audio
229
+ gr.update(value=None, visible=False), # download_btn
230
+ gr.update(visible=False, value=""), # validation_msg
231
+ gr.update(interactive=False), # submit_btn
232
+ ),
233
  inputs=[],
234
+ outputs=[prompt_audio, prompt_text, target_text, output_audio, download_btn, validation_msg, submit_btn],
235
  )
236
 
237
  # --- Add Examples ---
238
  gr.Markdown("### Examples")
239
+ ex = gr.Examples(
240
  examples=[
241
  [
242
  "assets/app/male.wav",
 
250
  ],
251
  ],
252
  inputs=[prompt_audio, prompt_text, target_text],
253
+ outputs=[output_audio, download_btn],
254
  fn=synthesize_fn,
255
+ cache_examples=False,
256
+ )
257
+
258
+ ex.dataset.click(
259
+ fn=_clear_outputs,
260
+ inputs=[],
261
+ outputs=[output_audio, download_btn],
262
+ queue=False,
263
  )
264
 
265
  demo.launch()
gradio_cached_examples/16/Synthesized audio/95f83d950a0400b268bd/tmppmcwrg5n DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac85b968e44a98af1e2f344ed56f68c700cd2b99a3c114d2552c66b2b6c2e957
3
- size 326444
 
 
 
 
gradio_cached_examples/16/Synthesized audio/b5933b8060d980ce1ea1/tmp339_glws DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a15baf860116573dd4985238c7a05fe3120f3732b43bef7d8c8aa22e07b5fbd
3
- size 322604
 
 
 
 
gradio_cached_examples/16/log.csv DELETED
@@ -1,3 +0,0 @@
1
- Synthesized audio,flag,username,timestamp
2
- "{""path"": ""gradio_cached_examples/16/Synthesized audio/95f83d950a0400b268bd/tmppmcwrg5n"", ""url"": null, ""size"": null, ""orig_name"": null, ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2025-09-28 16:43:00.957637
3
- "{""path"": ""gradio_cached_examples/16/Synthesized audio/b5933b8060d980ce1ea1/tmp339_glws"", ""url"": null, ""size"": null, ""orig_name"": null, ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2025-09-28 16:43:06.729484