gaoyang07 committed on
Commit 83c9f49 · 1 Parent(s): 35fe75b

Add audio references using Git LFS
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
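These two rules are exactly what `git lfs track "*.mp3"` and `git lfs track "*.wav"` append to .gitattributes: matching files are committed as small LFS pointer stubs while the actual audio bytes live in LFS storage.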
app.py CHANGED
@@ -1,7 +1,621 @@
+import argparse
+import functools
+import importlib.util
+from pathlib import Path
+import re
+import time
+import orjson
+
 import gradio as gr
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+import numpy as np
+import torch
+from transformers import AutoModel, AutoProcessor
+
+# Disable the broken cuDNN SDPA backend
+torch.backends.cuda.enable_cudnn_sdp(False)
+# Keep these enabled as fallbacks
+torch.backends.cuda.enable_flash_sdp(True)
+torch.backends.cuda.enable_mem_efficient_sdp(True)
+torch.backends.cuda.enable_math_sdp(True)
+
+MODEL_PATH = "OpenMOSS-Team/MOSS-TTS"
+DEFAULT_ATTN_IMPLEMENTATION = "auto"
+DEFAULT_MAX_NEW_TOKENS = 4096
+CONTINUATION_NOTICE = (
+    "Continuation mode is active. Make sure the reference audio transcript is prepended to the input text."
+)
+
+MODE_CLONE = "Clone"
+MODE_CONTINUE = "Continuation"
+MODE_CONTINUE_CLONE = "Continuation + Clone"
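+# Empirical audio-tokens-per-character rates used by estimate_duration_tokens()
+# below to pre-fill the duration slider. Illustrative arithmetic: a 20-character
+# Chinese text gives default = int(20 * 3.0984) = 61 tokens and a slider range
+# of [30, 91], i.e. 0.5x to 1.5x the default.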
+ZH_TOKENS_PER_CHAR = 3.098411951313033
+EN_TOKENS_PER_CHAR = 0.8673376262755219
+REFERENCE_AUDIO_DIR = Path(__file__).resolve().parent.parent / "assets" / "audio"
+EXAMPLE_TEXTS_JSONL_PATH = Path(__file__).resolve().parent.parent / "assets" / "text" / "moss_tts_example_texts.jsonl"
+
+
+def _parse_example_id(example_id: str) -> tuple[str, int] | None:
+    matched = re.fullmatch(r"(zh|en)/(\d+)", (example_id or "").strip())
+    if matched is None:
+        return None
+    return matched.group(1), int(matched.group(2))
+
+
+def _resolve_reference_audio_path(language: str, index: int) -> Path | None:
+    stem_candidates = [f"reference_{language}_{index}"]
+    for stem in stem_candidates:
+        for ext in (".wav", ".mp3"):
+            audio_path = REFERENCE_AUDIO_DIR / f"{stem}{ext}"
+            if audio_path.exists():
+                return audio_path
+    return None
+
+
+def build_example_rows() -> list[tuple[str, str, str]]:
+    rows: list[tuple[str, str, str]] = []
+
+    with open(EXAMPLE_TEXTS_JSONL_PATH, "rb") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            sample = orjson.loads(line)
+            parsed = _parse_example_id(sample.get("id", ""))
+            if parsed is None:
+                continue
+
+            language, index = parsed
+            text = str(sample.get("text", "")).strip()
+            audio_path = _resolve_reference_audio_path(language, index)
+            if audio_path is None:
+                continue
+
+            rows.append((sample["role"], str(audio_path), text))
+
+    return rows
+
+
+EXAMPLE_ROWS = build_example_rows()
+
+
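+# maxsize=1: only the most recent (model_path, device_str, attn_implementation)
+# combination stays resident; calling with different arguments reloads the
+# backend and evicts the cached one.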
+@functools.lru_cache(maxsize=1)
+def load_backend(model_path: str, device_str: str, attn_implementation: str):
+    device = torch.device(device_str if torch.cuda.is_available() else "cpu")
+    dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
+    resolved_attn_implementation = resolve_attn_implementation(
+        requested=attn_implementation,
+        device=device,
+        dtype=dtype,
+    )
+
+    processor = AutoProcessor.from_pretrained(
+        model_path,
+        trust_remote_code=True,
+    )
+    if hasattr(processor, "audio_tokenizer"):
+        processor.audio_tokenizer = processor.audio_tokenizer.to(device)
+
+    model_kwargs = {
+        "trust_remote_code": True,
+        "torch_dtype": dtype,
+    }
+    if resolved_attn_implementation:
+        model_kwargs["attn_implementation"] = resolved_attn_implementation
+
+    model = AutoModel.from_pretrained(model_path, **model_kwargs).to(device)
+    model.eval()
+
+    sample_rate = int(getattr(processor.model_config, "sampling_rate", 24000))
+    return model, processor, device, sample_rate
+
+
+def resolve_attn_implementation(requested: str, device: torch.device, dtype: torch.dtype) -> str | None:
+    requested_norm = (requested or "").strip().lower()
+
+    if requested_norm in {"none"}:
+        return None
+
+    if requested_norm not in {"", "auto"}:
+        return requested
+
+    # Prefer FlashAttention 2 when package + device conditions are met.
+    if (
+        device.type == "cuda"
+        and importlib.util.find_spec("flash_attn") is not None
+        and dtype in {torch.float16, torch.bfloat16}
+    ):
+        major, _ = torch.cuda.get_device_capability(device)
+        if major >= 8:
+            return "flash_attention_2"
+
+    # CUDA fallback: use PyTorch SDPA kernels.
+    if device.type == "cuda":
+        return "sdpa"
+
+    # CPU fallback.
+    return "eager"
+
+
+def detect_text_language(text: str) -> str:
+    zh_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
+    en_chars = len(re.findall(r"[A-Za-z]", text))
+    if zh_chars == 0 and en_chars == 0:
+        return "en"
+    return "zh" if zh_chars >= en_chars else "en"
+
+
+def supports_duration_control(mode_with_reference: str) -> bool:
+    return mode_with_reference not in {MODE_CONTINUE, MODE_CONTINUE_CLONE}
+
+
+def estimate_duration_tokens(text: str) -> tuple[str, int, int, int]:
+    normalized = text or ""
+    effective_len = max(len(normalized), 1)
+    language = detect_text_language(normalized)
+    factor = ZH_TOKENS_PER_CHAR if language == "zh" else EN_TOKENS_PER_CHAR
+    default_tokens = max(1, int(effective_len * factor))
+    min_tokens = max(1, int(default_tokens * 0.5))
+    max_tokens = max(min_tokens, int(default_tokens * 1.5))
+    return language, default_tokens, min_tokens, max_tokens
+
+
+def update_duration_controls(
+    enabled: bool,
+    text: str,
+    current_tokens: float | int | None,
+    mode_with_reference: str,
+):
+    if not supports_duration_control(mode_with_reference):
+        return (
+            gr.update(visible=False),
+            "Duration control is disabled for Continuation modes.",
+            gr.update(value=False, interactive=False),
+        )
+
+    checkbox_update = gr.update(interactive=True)
+    if not enabled:
+        return gr.update(visible=False), "Duration control is disabled.", checkbox_update
+
+    language, default_tokens, min_tokens, max_tokens = estimate_duration_tokens(text)
+    # Slider is initialized with value=1 as a placeholder; treat it as "unset"
+    # so first-time estimation uses the computed default instead of clamping to min.
+    if current_tokens is None or int(current_tokens) == 1:
+        slider_value = default_tokens
+    else:
+        slider_value = int(current_tokens)
+    slider_value = max(min_tokens, min(max_tokens, slider_value))
+
+    language_label = "Chinese" if language == "zh" else "English"
+    hint = (
+        f"Duration control enabled | detected language: {language_label} | "
+        f"default={default_tokens}, range=[{min_tokens}, {max_tokens}]"
+    )
+    return (
+        gr.update(
+            visible=True,
+            minimum=min_tokens,
+            maximum=max_tokens,
+            value=slider_value,
+            step=1,
+        ),
+        hint,
+        checkbox_update,
+    )
+
+
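+# Prompt layout per mode, as assembled below:
+#   Direct Generation: a single user message containing only the text.
+#   Clone: the reference audio is attached to the user message as a timbre reference.
+#   Continuation: the reference audio is appended as an assistant-turn prefix.
+#   Continuation + Clone: the reference serves as both timbre reference and prefix.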
+def build_conversation(
+    text: str,
+    reference_audio: str | None,
+    mode_with_reference: str,
+    expected_tokens: int | None,
+    processor,
+):
+    text = (text or "").strip()
+    if not text:
+        raise ValueError("Please enter text to synthesize.")
+
+    user_kwargs = {"text": text}
+    if expected_tokens is not None:
+        user_kwargs["tokens"] = int(expected_tokens)
+
+    if not reference_audio:
+        conversations = [[processor.build_user_message(**user_kwargs)]]
+        return conversations, "generation", "Direct Generation"
+
+    if mode_with_reference == MODE_CLONE:
+        clone_kwargs = dict(user_kwargs)
+        clone_kwargs["reference"] = [reference_audio]
+        conversations = [[processor.build_user_message(**clone_kwargs)]]
+        return conversations, "generation", MODE_CLONE
+
+    if mode_with_reference == MODE_CONTINUE:
+        conversations = [
+            [
+                processor.build_user_message(**user_kwargs),
+                processor.build_assistant_message(audio_codes_list=[reference_audio]),
+            ]
+        ]
+        return conversations, "continuation", MODE_CONTINUE
+
+    continue_clone_kwargs = dict(user_kwargs)
+    continue_clone_kwargs["reference"] = [reference_audio]
+    conversations = [
+        [
+            processor.build_user_message(**continue_clone_kwargs),
+            processor.build_assistant_message(audio_codes_list=[reference_audio]),
+        ]
+    ]
+    return conversations, "continuation", MODE_CONTINUE_CLONE
+
+
+def render_mode_hint(reference_audio: str | None, mode_with_reference: str):
+    if not reference_audio:
+        return "Current mode: **Direct Generation** (no reference audio uploaded)"
+    if mode_with_reference == MODE_CLONE:
+        return "Current mode: **Clone** (speaker timbre will be cloned from the reference audio)"
+    return f"Current mode: **{mode_with_reference}** \n> {CONTINUATION_NOTICE}"
+
+
+def apply_example_selection(
+    mode_with_reference: str,
+    duration_control_enabled: bool,
+    duration_tokens: int,
+    evt: gr.SelectData,
+):
+    if evt is None or evt.index is None:
+        return gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
+
+    if isinstance(evt.index, (tuple, list)):
+        row_idx = int(evt.index[0])
+    else:
+        row_idx = int(evt.index)
+
+    if row_idx < 0 or row_idx >= len(EXAMPLE_ROWS):
+        return gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
+
+    _, audio_path, example_text = EXAMPLE_ROWS[row_idx]
+    duration_slider_update, duration_hint, duration_checkbox_update = update_duration_controls(
+        duration_control_enabled,
+        example_text,
+        duration_tokens,
+        mode_with_reference,
+    )
+    return (
+        audio_path,
+        example_text,
+        render_mode_hint(audio_path, mode_with_reference),
+        duration_slider_update,
+        duration_hint,
+        duration_checkbox_update,
+    )
+
+
+def run_inference(
+    text: str,
+    reference_audio: str | None,
+    mode_with_reference: str,
+    duration_control_enabled: bool,
+    duration_tokens: int,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    repetition_penalty: float,
+    model_path: str,
+    device: str,
+    attn_implementation: str,
+    max_new_tokens: int,
+):
+    started_at = time.monotonic()
+    model, processor, torch_device, sample_rate = load_backend(
+        model_path=model_path,
+        device_str=device,
+        attn_implementation=attn_implementation,
+    )
+    duration_enabled = bool(duration_control_enabled and supports_duration_control(mode_with_reference))
+    expected_tokens = int(duration_tokens) if duration_enabled else None
+    conversations, mode, mode_name = build_conversation(
+        text=text,
+        reference_audio=reference_audio,
+        mode_with_reference=mode_with_reference,
+        expected_tokens=expected_tokens,
+        processor=processor,
+    )
+
+    batch = processor(conversations, mode=mode)
+    input_ids = batch["input_ids"].to(torch_device)
+    attention_mask = batch["attention_mask"].to(torch_device)
+
+    with torch.no_grad():
+        outputs = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=int(max_new_tokens),
+            audio_temperature=float(temperature),
+            audio_top_p=float(top_p),
+            audio_top_k=int(top_k),
+            audio_repetition_penalty=float(repetition_penalty),
+        )
+
+    messages = processor.decode(outputs)
+    if not messages or messages[0] is None:
+        raise RuntimeError("The model did not return a decodable audio result.")
+
+    audio = messages[0].audio_codes_list[0]
+    if isinstance(audio, torch.Tensor):
+        audio_np = audio.detach().float().cpu().numpy()
+    else:
+        audio_np = np.asarray(audio, dtype=np.float32)
+
+    if audio_np.ndim > 1:
+        audio_np = audio_np.reshape(-1)
+    audio_np = audio_np.astype(np.float32, copy=False)
+
+    elapsed = time.monotonic() - started_at
+    status = (
+        f"Done | mode: {mode_name} | elapsed: {elapsed:.2f}s | "
+        f"max_new_tokens={int(max_new_tokens)}, "
+        f"expected_tokens={expected_tokens if expected_tokens is not None else 'off'}, "
+        f"audio_temperature={float(temperature):.2f}, audio_top_p={float(top_p):.2f}, "
+        f"audio_top_k={int(top_k)}, audio_repetition_penalty={float(repetition_penalty):.2f}"
+    )
+    return (sample_rate, audio_np), status
+
+
+def build_demo(args: argparse.Namespace):
+    custom_css = """
+    :root {
+        --bg: #f6f7f8;
+        --panel: #ffffff;
+        --ink: #111418;
+        --muted: #4d5562;
+        --line: #e5e7eb;
+        --accent: #0f766e;
+    }
+    .gradio-container {
+        background: linear-gradient(180deg, #f7f8fa 0%, #f3f5f7 100%);
+        color: var(--ink);
+    }
+    .app-card {
+        border: 1px solid var(--line);
+        border-radius: 16px;
+        background: var(--panel);
+        padding: 14px;
+    }
+    .app-title {
+        font-size: 22px;
+        font-weight: 700;
+        margin-bottom: 6px;
+        letter-spacing: 0.2px;
+    }
+    .app-subtitle {
+        color: var(--muted);
+        font-size: 14px;
+        margin-bottom: 8px;
+    }
+    #output_audio {
+        padding-bottom: 12px;
+        margin-bottom: 8px;
+        overflow: hidden !important;
+    }
+    #output_audio > .wrap {
+        overflow: hidden !important;
+    }
+    #output_audio audio {
+        margin-bottom: 6px;
+    }
+    #run-btn {
+        background: var(--accent);
+        border: none;
+    }
+    """
+
+    with gr.Blocks(title="MOSS-TTS Demo", css=custom_css) as demo:
+        gr.Markdown(
+            """
+            <div class="app-card">
+            <div class="app-title">MOSS-TTS</div>
+            <div class="app-subtitle">Minimal UI: Direct Generation, Clone, Continuation, Continuation + Clone</div>
+            </div>
+            """
+        )
+
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=3):
+                text = gr.Textbox(
+                    label="Text",
+                    lines=9,
+                    placeholder="Enter text to synthesize. In continuation modes, prepend the reference audio transcript.",
+                )
+                reference_audio = gr.Audio(
+                    label="Reference Audio (Optional)",
+                    type="filepath",
+                )
+                mode_with_reference = gr.Radio(
+                    choices=[MODE_CLONE, MODE_CONTINUE, MODE_CONTINUE_CLONE],
+                    value=MODE_CLONE,
+                    label="Mode with Reference Audio",
+                    info="If no reference audio is uploaded, Direct Generation will be used automatically.",
+                )
+                mode_hint = gr.Markdown(render_mode_hint(None, MODE_CLONE))
+                duration_control_enabled = gr.Checkbox(
+                    value=False,
+                    label="Enable Duration Control (Expected Audio Tokens)",
+                )
+                duration_tokens = gr.Slider(
+                    minimum=1,
+                    maximum=1,
+                    step=1,
+                    value=1,
+                    label="expected_tokens",
+                    visible=False,
+                )
+                duration_hint = gr.Markdown("Duration control is disabled.")
+
+                with gr.Accordion("Sampling Parameters (Audio)", open=True):
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=3.0,
+                        step=0.05,
+                        value=1.7,
+                        label="temperature",
+                    )
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        step=0.01,
+                        value=0.8,
+                        label="top_p",
+                    )
+                    top_k = gr.Slider(
+                        minimum=1,
+                        maximum=200,
+                        step=1,
+                        value=25,
+                        label="top_k",
+                    )
+                    repetition_penalty = gr.Slider(
+                        minimum=0.8,
+                        maximum=2.0,
+                        step=0.05,
+                        value=1.0,
+                        label="repetition_penalty",
+                    )
+                    max_new_tokens = gr.Slider(
+                        minimum=256,
+                        maximum=8192,
+                        step=128,
+                        value=DEFAULT_MAX_NEW_TOKENS,
+                        label="max_new_tokens",
+                    )
+
+                run_btn = gr.Button("Generate Speech", variant="primary", elem_id="run-btn")
+
+            with gr.Column(scale=2):
+                output_audio = gr.Audio(label="Output Audio", type="numpy", elem_id="output_audio")
+                status = gr.Textbox(label="Status", lines=4, interactive=False)
+                examples_table = gr.Dataframe(
+                    headers=["Reference Speech", "Example Text"],
+                    value=[[name, text] for name, _, text in EXAMPLE_ROWS],
+                    datatype=["str", "str"],
+                    row_count=(len(EXAMPLE_ROWS), "fixed"),
+                    col_count=(2, "fixed"),
+                    interactive=False,
+                    wrap=True,
+                    label="Examples (click a row to fill inputs)",
+                )
+
+        reference_audio.change(
+            fn=render_mode_hint,
+            inputs=[reference_audio, mode_with_reference],
+            outputs=[mode_hint],
+        )
+        mode_with_reference.change(
+            fn=render_mode_hint,
+            inputs=[reference_audio, mode_with_reference],
+            outputs=[mode_hint],
+        )
+        duration_control_enabled.change(
+            fn=update_duration_controls,
+            inputs=[duration_control_enabled, text, duration_tokens, mode_with_reference],
+            outputs=[duration_tokens, duration_hint, duration_control_enabled],
+        )
+        text.change(
+            fn=update_duration_controls,
+            inputs=[duration_control_enabled, text, duration_tokens, mode_with_reference],
+            outputs=[duration_tokens, duration_hint, duration_control_enabled],
+        )
+        mode_with_reference.change(
+            fn=update_duration_controls,
+            inputs=[duration_control_enabled, text, duration_tokens, mode_with_reference],
+            outputs=[duration_tokens, duration_hint, duration_control_enabled],
+        )
+        examples_table.select(
+            fn=apply_example_selection,
+            inputs=[mode_with_reference, duration_control_enabled, duration_tokens],
+            outputs=[
+                reference_audio,
+                text,
+                mode_hint,
+                duration_tokens,
+                duration_hint,
+                duration_control_enabled,
+            ],
+        )
+
+        run_btn.click(
+            fn=lambda text, reference_audio, mode_with_reference, duration_control_enabled, duration_tokens, temperature, top_p, top_k, repetition_penalty, max_new_tokens: run_inference(
+                text=text,
+                reference_audio=reference_audio,
+                mode_with_reference=mode_with_reference,
+                duration_control_enabled=duration_control_enabled,
+                duration_tokens=duration_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                model_path=args.model_path,
+                device=args.device,
+                attn_implementation=args.attn_implementation,
+                max_new_tokens=max_new_tokens,
+            ),
+            inputs=[
+                text,
+                reference_audio,
+                mode_with_reference,
+                duration_control_enabled,
+                duration_tokens,
+                temperature,
+                top_p,
+                top_k,
+                repetition_penalty,
+                max_new_tokens,
+            ],
+            outputs=[output_audio, status],
+        )
+    return demo


+def main():
+    parser = argparse.ArgumentParser(description="MossTTS Gradio Demo")
+    parser.add_argument("--model_path", type=str, default=MODEL_PATH)
+    parser.add_argument("--device", type=str, default="cuda:0")
+    parser.add_argument("--attn_implementation", type=str, default=DEFAULT_ATTN_IMPLEMENTATION)
+    parser.add_argument("--host", type=str, default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=7860)
+    parser.add_argument("--share", action="store_true")
+    args = parser.parse_args()
+
+    runtime_device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+    runtime_dtype = torch.bfloat16 if runtime_device.type == "cuda" else torch.float32
+    args.attn_implementation = resolve_attn_implementation(
+        requested=args.attn_implementation,
+        device=runtime_device,
+        dtype=runtime_dtype,
+    ) or "none"
+    print(f"[INFO] Using attn_implementation={args.attn_implementation}", flush=True)
+
+    # Preload model/processor at startup to avoid first-request cold start latency.
+    preload_started_at = time.monotonic()
+    print(
+        f"[Startup] Preloading backend: model={args.model_path}, device={args.device}, attn={args.attn_implementation}",
+        flush=True,
+    )
+    load_backend(
+        model_path=args.model_path,
+        device_str=args.device,
+        attn_implementation=args.attn_implementation,
+    )
+    print(
+        f"[Startup] Backend preload finished in {time.monotonic() - preload_started_at:.2f}s",
+        flush=True,
+    )
+
+    demo = build_demo(args)
+    demo.queue(max_size=16, default_concurrency_limit=1).launch(
+        server_name=args.host,
+        server_port=args.port,
+        share=args.share,
+    )
+
+
+if __name__ == "__main__":
+    main()
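With the defaults above, the Space entrypoint is simply `python app.py`; the `--model_path`, `--device`, `--attn_implementation`, `--host`, `--port`, and `--share` flags defined in `main()` override the runtime settings.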
assets/audio/reference_en_0.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:488e0e6bb4e48a7eb861f8fc7763565587287cde152cbec141b952089b02b2ef
+size 90594
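The stub above (like each audio file added below) is not raw audio but a Git LFS pointer: a three-line text file recording the LFS spec version, the SHA-256 digest of the real content, and its size in bytes. A minimal sketch of reading such a stub, with parse_lfs_pointer as a hypothetical helper rather than anything shipped in this repo:

def parse_lfs_pointer(path):
    # Each pointer line is "<key> <value>"; blank lines are skipped.
    fields = dict(
        line.split(" ", 1)
        for line in open(path, encoding="utf-8")
        if " " in line
    )
    return {
        "version": fields["version"].strip(),  # LFS spec URL
        "oid": fields["oid"].strip(),          # e.g. "sha256:488e0e6bb4e4..."
        "size": int(fields["size"]),           # byte size of the real file
    }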
assets/audio/reference_en_1.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82e4fad862ccb12a4ac609623fe6c275d167f7dcfc7866ca740f38ab169935c6
+size 213836
assets/audio/reference_en_2.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:582f8e1e3f3792d7b495e159c29a55bb95c4c46e90725c62807b4b12bf341603
+size 322923
assets/audio/reference_en_3.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4bd9c6ffb765fda23297fae21725bc174a3092d9687c3606f11d00ae0df9fc1e
+size 107943
assets/audio/reference_zh_0.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5112b5e2bef2a727534af85da1e56048a5ab5552de7aa7cbb5f48b0fa4f5eec
+size 448172
assets/audio/reference_zh_1.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4ff19c55d55a37dbbd550e6624a2faf6cfa7fd56a9594456b17fbe3838b2245
+size 1480128
assets/audio/reference_zh_2.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1686d3e2b1fe2f6b079cf6a41a9cd9ba31c8f9d3cfe03ff411dd0359641c0c8
+size 505586
assets/audio/reference_zh_3.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cffa7c5d91c28895caf51c38418af9651c82a4e16a8e4c04e10991bf80cc04cc
+size 347949
assets/text/moss_tts_example_texts.jsonl ADDED
@@ -0,0 +1,8 @@
+{"id":"zh/0","language":"zh","role":"可爱的小女孩","text":"亲爱的你,\n你好呀。\n今天,我想用最认真、最温柔的声音,对你说一些重要的话。\n这些话,像一颗小小的星星,希望能在你的心里慢慢发光。"}
+{"id":"zh/1","language":"zh","role":"吴俊全老师","text":"从1948年9月12日至1949年1月31日,连续组织了震惊世界的辽沈、淮海、平津三个大战役,这一百四十二个昼夜中,双方统帅部和各级指挥部所拍发的电码讯号错综交汇,织成一面无形的网,从大气层覆盖下来,于是便注定了中国的山川将会怎样排列,流云又当如何变幻。"}
+{"id":"zh/2","language":"zh","role":"原神胡桃","text":"嘿——你在听吗?\n嗯,不说话也没关系啦,反正我已经习惯自言自语了。\n我是胡桃,往生堂第七十七代堂主。\n别紧张别紧张,我今天不是来“请你喝茶”的——至少现在还不是。\n很多人一听到“往生堂”,就皱起眉头,好像我一开口,空气都要凉三分。\n可你看啊,太阳每天都会落下,可谁会因此不喜欢黄昏呢?\n生与死也是一样的道理嘛——\n不是终点,而是换一条路走走。"}
+{"id":"zh/3","language":"zh","role":"明星杨幂","text":"有些人喜欢被照顾,\n而我更习惯照亮自己。\n\n不是不需要依靠,\n只是明白——\n真正能陪你走到最后的,\n从来都不是运气。\n\n我见过凌晨四点的城市,\n也见过掌声散去后的安静。\n那些看起来毫不费力的从容,\n其实都藏着一次次咬牙坚持。"}
+{"id":"en/0","language":"en","role":"Taylor Swift","text":"Tonight, I just want to take a second and breathe this in with you.\nBecause moments like this don’t happen by accident. They’re built—one lyric at a time, one late night at a time, one brave decision at a time. They’re built by people who keep showing up, even when life is loud, even when the world is heavy, even when they’re not sure anyone sees the effort they’re making."}
+{"id":"en/1","language":"en","role":"Iron Man","text":"Look, I know what you’re thinking. Here he goes again. The guy in the metal suit, the walking ego with a repulsor problem, about to make a speech like it’s a press conference and I’m getting paid by the syllable. Relax. This one isn’t for the cameras. No sponsors, no applause, no clever angle that makes me look taller than I already am."}
+{"id":"en/2","language":"en","role":"David Attenborough","text":"In the quiet hours before dawn, the world looks unfinished. Streets are empty, windows are dark, and the air holds its breath as if waiting for a cue. But beneath the stillness, everything is moving. Water is traveling through pipes. Electricity is humming along invisible lines. Seeds are pushing against soil. Somewhere, a hand reaches for a switch, and a day begins."}
+{"id":"en/3","language":"en","role":"Rick Sanchez","text":"Look, you keep staring at the sky like it’s a customer service desk, waiting for the universe to hand you a receipt that says your pain was “worth it.” Newsflash: the cosmos doesn’t do refunds, it does entropy. It does random collisions of atoms that occasionally arrange themselves into a biped with anxiety and a subscription to self-importance."}
assets/text/moss_voice_generator_example_texts.jsonl ADDED
@@ -0,0 +1,8 @@
+{"id":"zh/0","language":"zh","instruction":"撕心裂肺,声泪俱下的中年女性","text":"皇上,臣妾做不到啊!皇上,您就杀了臣妾吧!"}
+{"id":"zh/1","language":"zh","instruction":"年轻女性,开头傲慢不屑,发现对方身份后秒怂,疯狂道歉,惊慌失措","text":"你谁啊,关你什么事?啊…王总,您好您好,我不知道是您……"}
+{"id":"zh/2","language":"zh","instruction":"疲惫沙哑的老年声音缓慢抱怨,带有轻微呻吟。","text":"哎呀,我的老腰啊,这年纪大了就是不行了。"}
+{"id":"zh/3","language":"zh","instruction":"粗犷急躁的海盗船长,语速快,语调低沉而充满命令,带着一股不容置疑的霸道。","text":"快点!把那箱金币搬过来!速度快点!别磨磨蹭蹭的!我们必须在涨潮之前离开这里,否则就来不及了!"}
+{"id":"en/0","language":"en","instruction":"Mom scolding kid for breaking a vase, then seeing he cut himself, shifting to concern","text":"How many times have I told you not to run in the house?! You could have…… oh honey, you're bleeding! Let me see your hand…… It's okay, baby."}
+{"id":"en/1","language":"en","instruction":"An elderly female voice, slightly nasal and soft, speaking in a frail, polite British tone, conveying subtle discomfort with gentle hesitation.","text":"Achoo! Oh dear, I do believe I'm catching a cold. This dreadful weather is just too much."}
+{"id":"en/2","language":"en","instruction":"Little girl, innocent and curious, high-pitched and adorable","text":"Mommy, why is the sky blue? And why do birds fly? And why-"}
+{"id":"en/3","language":"en","instruction":"Emotional pop ballad with smooth, melodic delivery, slow tempo with gentle vibrato on sustained notes, conveying hope and vulnerability.","text":"Walking down this empty street tonight, searching for a guiding light, stars above shine oh so bright, everything will be alright"}