gaoyang07 committed on
Commit 83c9f49 · 1 Parent(s): 35fe75b

Add audio references using Git LFS
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
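These two rules are exactly what `git lfs track "*.mp3"` and `git lfs track "*.wav"` append to .gitattributes: matching files are committed as small LFS pointer stubs while the actual audio bytes live in LFS storage.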
app.py CHANGED
@@ -1,7 +1,621 @@
+import argparse
+import functools
+import importlib.util
+from pathlib import Path
+import re
+import time
+import orjson
+
 import gradio as gr
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+import numpy as np
+import torch
+from transformers import AutoModel, AutoProcessor
+
+# Disable the broken cuDNN SDPA backend
+torch.backends.cuda.enable_cudnn_sdp(False)
+# Keep these enabled as fallbacks
+torch.backends.cuda.enable_flash_sdp(True)
+torch.backends.cuda.enable_mem_efficient_sdp(True)
+torch.backends.cuda.enable_math_sdp(True)
+
+MODEL_PATH = "OpenMOSS-Team/MOSS-TTS"
+DEFAULT_ATTN_IMPLEMENTATION = "auto"
+DEFAULT_MAX_NEW_TOKENS = 4096
+CONTINUATION_NOTICE = (
+    "Continuation mode is active. Make sure the reference audio transcript is prepended to the input text."
+)
+
+MODE_CLONE = "Clone"
+MODE_CONTINUE = "Continuation"
+MODE_CONTINUE_CLONE = "Continuation + Clone"
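+# Empirical audio-tokens-per-character rates used by estimate_duration_tokens()
+# below to pre-fill the duration slider. Illustrative arithmetic: a 20-character
+# Chinese text gives default = int(20 * 3.0984) = 61 tokens and a slider range
+# of [30, 91], i.e. 0.5x to 1.5x the default.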
+ZH_TOKENS_PER_CHAR = 3.098411951313033
+EN_TOKENS_PER_CHAR = 0.8673376262755219
+REFERENCE_AUDIO_DIR = Path(__file__).resolve().parent.parent / "assets" / "audio"
+EXAMPLE_TEXTS_JSONL_PATH = Path(__file__).resolve().parent.parent / "assets" / "text" / "moss_tts_example_texts.jsonl"
+
+
+def _parse_example_id(example_id: str) -> tuple[str, int] | None:
+    matched = re.fullmatch(r"(zh|en)/(\d+)", (example_id or "").strip())
+    if matched is None:
+        return None
+    return matched.group(1), int(matched.group(2))
+
+
+def _resolve_reference_audio_path(language: str, index: int) -> Path | None:
+    stem_candidates = [f"reference_{language}_{index}"]
+    for stem in stem_candidates:
+        for ext in (".wav", ".mp3"):
+            audio_path = REFERENCE_AUDIO_DIR / f"{stem}{ext}"
+            if audio_path.exists():
+                return audio_path
+    return None
+
+
+def build_example_rows() -> list[tuple[str, str, str]]:
+    rows: list[tuple[str, str, str]] = []
+
+    with open(EXAMPLE_TEXTS_JSONL_PATH, "rb") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            sample = orjson.loads(line)
+            parsed = _parse_example_id(sample.get("id", ""))
+            if parsed is None:
+                continue
+
+            language, index = parsed
+            text = str(sample.get("text", "")).strip()
+            audio_path = _resolve_reference_audio_path(language, index)
+            if audio_path is None:
+                continue
+
+            rows.append((sample["role"], str(audio_path), text))
+
+    return rows
+
+
+EXAMPLE_ROWS = build_example_rows()
+
+
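+# maxsize=1: only the most recent (model_path, device_str, attn_implementation)
+# combination stays resident; calling with different arguments reloads the
+# backend and evicts the cached one.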
+@functools.lru_cache(maxsize=1)
+def load_backend(model_path: str, device_str: str, attn_implementation: str):
+    device = torch.device(device_str if torch.cuda.is_available() else "cpu")
+    dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
+    resolved_attn_implementation = resolve_attn_implementation(
+        requested=attn_implementation,
+        device=device,
+        dtype=dtype,
+    )
+
+    processor = AutoProcessor.from_pretrained(
+        model_path,
+        trust_remote_code=True,
+    )
+    if hasattr(processor, "audio_tokenizer"):
+        processor.audio_tokenizer = processor.audio_tokenizer.to(device)
+
+    model_kwargs = {
+        "trust_remote_code": True,
+        "torch_dtype": dtype,
+    }
+    if resolved_attn_implementation:
+        model_kwargs["attn_implementation"] = resolved_attn_implementation
+
+    model = AutoModel.from_pretrained(model_path, **model_kwargs).to(device)
+    model.eval()
+
+    sample_rate = int(getattr(processor.model_config, "sampling_rate", 24000))
+    return model, processor, device, sample_rate
+
+
+def resolve_attn_implementation(requested: str, device: torch.device, dtype: torch.dtype) -> str | None:
+    requested_norm = (requested or "").strip().lower()
+
+    if requested_norm in {"none"}:
+        return None
+
+    if requested_norm not in {"", "auto"}:
+        return requested
+
+    # Prefer FlashAttention 2 when package + device conditions are met.
+    if (
+        device.type == "cuda"
+        and importlib.util.find_spec("flash_attn") is not None
+        and dtype in {torch.float16, torch.bfloat16}
+    ):
+        major, _ = torch.cuda.get_device_capability(device)
+        if major >= 8:
+            return "flash_attention_2"
+
+    # CUDA fallback: use PyTorch SDPA kernels.
+    if device.type == "cuda":
+        return "sdpa"
+
+    # CPU fallback.
+    return "eager"
+
+
+def detect_text_language(text: str) -> str:
+    zh_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
+    en_chars = len(re.findall(r"[A-Za-z]", text))
+    if zh_chars == 0 and en_chars == 0:
+        return "en"
+    return "zh" if zh_chars >= en_chars else "en"
+
+
+def supports_duration_control(mode_with_reference: str) -> bool:
+    return mode_with_reference not in {MODE_CONTINUE, MODE_CONTINUE_CLONE}
+
+
+def estimate_duration_tokens(text: str) -> tuple[str, int, int, int]:
+    normalized = text or ""
+    effective_len = max(len(normalized), 1)
+    language = detect_text_language(normalized)
+    factor = ZH_TOKENS_PER_CHAR if language == "zh" else EN_TOKENS_PER_CHAR
+    default_tokens = max(1, int(effective_len * factor))
+    min_tokens = max(1, int(default_tokens * 0.5))
+    max_tokens = max(min_tokens, int(default_tokens * 1.5))
+    return language, default_tokens, min_tokens, max_tokens
+
+
+def update_duration_controls(
+    enabled: bool,
+    text: str,
+    current_tokens: float | int | None,
+    mode_with_reference: str,
+):
+    if not supports_duration_control(mode_with_reference):
+        return (
+            gr.update(visible=False),
+            "Duration control is disabled for Continuation modes.",
+            gr.update(value=False, interactive=False),
+        )
+
+    checkbox_update = gr.update(interactive=True)
+    if not enabled:
+        return gr.update(visible=False), "Duration control is disabled.", checkbox_update
+
+    language, default_tokens, min_tokens, max_tokens = estimate_duration_tokens(text)
+    # Slider is initialized with value=1 as a placeholder; treat it as "unset"
+    # so first-time estimation uses the computed default instead of clamping to min.
+    if current_tokens is None or int(current_tokens) == 1:
+        slider_value = default_tokens
+    else:
+        slider_value = int(current_tokens)
+    slider_value = max(min_tokens, min(max_tokens, slider_value))
+
+    language_label = "Chinese" if language == "zh" else "English"
+    hint = (
+        f"Duration control enabled | detected language: {language_label} | "
+        f"default={default_tokens}, range=[{min_tokens}, {max_tokens}]"
+    )
+    return (
+        gr.update(
+            visible=True,
+            minimum=min_tokens,
+            maximum=max_tokens,
+            value=slider_value,
+            step=1,
+        ),
+        hint,
+        checkbox_update,
+    )
+
+
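+# Prompt layout per mode, as assembled below:
+#   Direct Generation: a single user message containing only the text.
+#   Clone: the reference audio is attached to the user message as a timbre reference.
+#   Continuation: the reference audio is appended as an assistant-turn prefix.
+#   Continuation + Clone: the reference serves as both timbre reference and prefix.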
+def build_conversation(
+    text: str,
+    reference_audio: str | None,
+    mode_with_reference: str,
+    expected_tokens: int | None,
+    processor,
+):
+    text = (text or "").strip()
+    if not text:
+        raise ValueError("Please enter text to synthesize.")
+
+    user_kwargs = {"text": text}
+    if expected_tokens is not None:
+        user_kwargs["tokens"] = int(expected_tokens)
+
+    if not reference_audio:
+        conversations = [[processor.build_user_message(**user_kwargs)]]
+        return conversations, "generation", "Direct Generation"
+
+    if mode_with_reference == MODE_CLONE:
+        clone_kwargs = dict(user_kwargs)
+        clone_kwargs["reference"] = [reference_audio]
+        conversations = [[processor.build_user_message(**clone_kwargs)]]
+        return conversations, "generation", MODE_CLONE
+
+    if mode_with_reference == MODE_CONTINUE:
+        conversations = [
+            [
+                processor.build_user_message(**user_kwargs),
+                processor.build_assistant_message(audio_codes_list=[reference_audio]),
+            ]
+        ]
+        return conversations, "continuation", MODE_CONTINUE
+
+    continue_clone_kwargs = dict(user_kwargs)
+    continue_clone_kwargs["reference"] = [reference_audio]
+    conversations = [
+        [
+            processor.build_user_message(**continue_clone_kwargs),
+            processor.build_assistant_message(audio_codes_list=[reference_audio]),
+        ]
+    ]
+    return conversations, "continuation", MODE_CONTINUE_CLONE
+
+
+def render_mode_hint(reference_audio: str | None, mode_with_reference: str):
+    if not reference_audio:
+        return "Current mode: **Direct Generation** (no reference audio uploaded)"
+    if mode_with_reference == MODE_CLONE:
+        return "Current mode: **Clone** (speaker timbre will be cloned from the reference audio)"
+    return f"Current mode: **{mode_with_reference}** \n> {CONTINUATION_NOTICE}"
+
+
+def apply_example_selection(
+    mode_with_reference: str,
+    duration_control_enabled: bool,
+    duration_tokens: int,
+    evt: gr.SelectData,
+):
+    if evt is None or evt.index is None:
+        return gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
+
+    if isinstance(evt.index, (tuple, list)):
+        row_idx = int(evt.index[0])
+    else:
+        row_idx = int(evt.index)
+
+    if row_idx < 0 or row_idx >= len(EXAMPLE_ROWS):
+        return gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
+
+    _, audio_path, example_text = EXAMPLE_ROWS[row_idx]
+    duration_slider_update, duration_hint, duration_checkbox_update = update_duration_controls(
+        duration_control_enabled,
+        example_text,
+        duration_tokens,
+        mode_with_reference,
+    )
+    return (
+        audio_path,
+        example_text,
+        render_mode_hint(audio_path, mode_with_reference),
+        duration_slider_update,
+        duration_hint,
+        duration_checkbox_update,
+    )
+
+
+def run_inference(
+    text: str,
+    reference_audio: str | None,
+    mode_with_reference: str,
+    duration_control_enabled: bool,
+    duration_tokens: int,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    repetition_penalty: float,
+    model_path: str,
+    device: str,
+    attn_implementation: str,
+    max_new_tokens: int,
+):
+    started_at = time.monotonic()
+    model, processor, torch_device, sample_rate = load_backend(
+        model_path=model_path,
+        device_str=device,
+        attn_implementation=attn_implementation,
+    )
+    duration_enabled = bool(duration_control_enabled and supports_duration_control(mode_with_reference))
+    expected_tokens = int(duration_tokens) if duration_enabled else None
+    conversations, mode, mode_name = build_conversation(
+        text=text,
+        reference_audio=reference_audio,
+        mode_with_reference=mode_with_reference,
+        expected_tokens=expected_tokens,
+        processor=processor,
+    )
+
+    batch = processor(conversations, mode=mode)
+    input_ids = batch["input_ids"].to(torch_device)
+    attention_mask = batch["attention_mask"].to(torch_device)
+
+    with torch.no_grad():
+        outputs = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=int(max_new_tokens),
+            audio_temperature=float(temperature),
+            audio_top_p=float(top_p),
+            audio_top_k=int(top_k),
+            audio_repetition_penalty=float(repetition_penalty),
+        )
+
+    messages = processor.decode(outputs)
+    if not messages or messages[0] is None:
+        raise RuntimeError("The model did not return a decodable audio result.")
+
+    audio = messages[0].audio_codes_list[0]
+    if isinstance(audio, torch.Tensor):
+        audio_np = audio.detach().float().cpu().numpy()
+    else:
+        audio_np = np.asarray(audio, dtype=np.float32)
+
+    if audio_np.ndim > 1:
+        audio_np = audio_np.reshape(-1)
+    audio_np = audio_np.astype(np.float32, copy=False)
+
+    elapsed = time.monotonic() - started_at
+    status = (
+        f"Done | mode: {mode_name} | elapsed: {elapsed:.2f}s | "
+        f"max_new_tokens={int(max_new_tokens)}, "
+        f"expected_tokens={expected_tokens if expected_tokens is not None else 'off'}, "
+        f"audio_temperature={float(temperature):.2f}, audio_top_p={float(top_p):.2f}, "
+        f"audio_top_k={int(top_k)}, audio_repetition_penalty={float(repetition_penalty):.2f}"
+    )
+    return (sample_rate, audio_np), status
+
+
+def build_demo(args: argparse.Namespace):
+    custom_css = """
+    :root {
+        --bg: #f6f7f8;
+        --panel: #ffffff;
+        --ink: #111418;
+        --muted: #4d5562;
+        --line: #e5e7eb;
+        --accent: #0f766e;
+    }
+    .gradio-container {
+        background: linear-gradient(180deg, #f7f8fa 0%, #f3f5f7 100%);
+        color: var(--ink);
+    }
+    .app-card {
+        border: 1px solid var(--line);
+        border-radius: 16px;
+        background: var(--panel);
+        padding: 14px;
+    }
+    .app-title {
+        font-size: 22px;
+        font-weight: 700;
+        margin-bottom: 6px;
+        letter-spacing: 0.2px;
+    }
+    .app-subtitle {
+        color: var(--muted);
+        font-size: 14px;
+        margin-bottom: 8px;
+    }
+    #output_audio {
+        padding-bottom: 12px;
+        margin-bottom: 8px;
+        overflow: hidden !important;
+    }
+    #output_audio > .wrap {
+        overflow: hidden !important;
+    }
+    #output_audio audio {
+        margin-bottom: 6px;
+    }
+    #run-btn {
+        background: var(--accent);
+        border: none;
+    }
+    """
+
+    with gr.Blocks(title="MOSS-TTS Demo", css=custom_css) as demo:
+        gr.Markdown(
+            """
+            <div class="app-card">
+            <div class="app-title">MOSS-TTS</div>
+            <div class="app-subtitle">Minimal UI: Direct Generation, Clone, Continuation, Continuation + Clone</div>
+            </div>
+            """
+        )
+
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=3):
+                text = gr.Textbox(
+                    label="Text",
+                    lines=9,
+                    placeholder="Enter text to synthesize. In continuation modes, prepend the reference audio transcript.",
+                )
+                reference_audio = gr.Audio(
+                    label="Reference Audio (Optional)",
+                    type="filepath",
+                )
+                mode_with_reference = gr.Radio(
+                    choices=[MODE_CLONE, MODE_CONTINUE, MODE_CONTINUE_CLONE],
+                    value=MODE_CLONE,
+                    label="Mode with Reference Audio",
+                    info="If no reference audio is uploaded, Direct Generation will be used automatically.",
+                )
+                mode_hint = gr.Markdown(render_mode_hint(None, MODE_CLONE))
+                duration_control_enabled = gr.Checkbox(
+                    value=False,
+                    label="Enable Duration Control (Expected Audio Tokens)",
+                )
+                duration_tokens = gr.Slider(
+                    minimum=1,
+                    maximum=1,
+                    step=1,
+                    value=1,
+                    label="expected_tokens",
+                    visible=False,
+                )
+                duration_hint = gr.Markdown("Duration control is disabled.")
+
+                with gr.Accordion("Sampling Parameters (Audio)", open=True):
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=3.0,
+                        step=0.05,
+                        value=1.7,
+                        label="temperature",
+                    )
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        step=0.01,
+                        value=0.8,
+                        label="top_p",
+                    )
+                    top_k = gr.Slider(
+                        minimum=1,
+                        maximum=200,
+                        step=1,
+                        value=25,
+                        label="top_k",
+                    )
+                    repetition_penalty = gr.Slider(
+                        minimum=0.8,
+                        maximum=2.0,
+                        step=0.05,
+                        value=1.0,
+                        label="repetition_penalty",
+                    )
+                    max_new_tokens = gr.Slider(
+                        minimum=256,
+                        maximum=8192,
+                        step=128,
+                        value=DEFAULT_MAX_NEW_TOKENS,
+                        label="max_new_tokens",
+                    )
+
+                run_btn = gr.Button("Generate Speech", variant="primary", elem_id="run-btn")
+
+            with gr.Column(scale=2):
+                output_audio = gr.Audio(label="Output Audio", type="numpy", elem_id="output_audio")
+                status = gr.Textbox(label="Status", lines=4, interactive=False)
+                examples_table = gr.Dataframe(
+                    headers=["Reference Speech", "Example Text"],
+                    value=[[name, text] for name, _, text in EXAMPLE_ROWS],
+                    datatype=["str", "str"],
+                    row_count=(len(EXAMPLE_ROWS), "fixed"),
+                    col_count=(2, "fixed"),
+                    interactive=False,
+                    wrap=True,
+                    label="Examples (click a row to fill inputs)",
+                )
+
+        reference_audio.change(
+            fn=render_mode_hint,
+            inputs=[reference_audio, mode_with_reference],
+            outputs=[mode_hint],
+        )
+        mode_with_reference.change(
+            fn=render_mode_hint,
+            inputs=[reference_audio, mode_with_reference],
+            outputs=[mode_hint],
+        )
+        duration_control_enabled.change(
+            fn=update_duration_controls,
+            inputs=[duration_control_enabled, text, duration_tokens, mode_with_reference],
+            outputs=[duration_tokens, duration_hint, duration_control_enabled],
+        )
+        text.change(
+            fn=update_duration_controls,
+            inputs=[duration_control_enabled, text, duration_tokens, mode_with_reference],
+            outputs=[duration_tokens, duration_hint, duration_control_enabled],
+        )
+        mode_with_reference.change(
+            fn=update_duration_controls,
+            inputs=[duration_control_enabled, text, duration_tokens, mode_with_reference],
+            outputs=[duration_tokens, duration_hint, duration_control_enabled],
+        )
+        examples_table.select(
+            fn=apply_example_selection,
+            inputs=[mode_with_reference, duration_control_enabled, duration_tokens],
+            outputs=[
+                reference_audio,
+                text,
+                mode_hint,
+                duration_tokens,
+                duration_hint,
+                duration_control_enabled,
+            ],
+        )
+
+        run_btn.click(
+            fn=lambda text, reference_audio, mode_with_reference, duration_control_enabled, duration_tokens, temperature, top_p, top_k, repetition_penalty, max_new_tokens: run_inference(
+                text=text,
+                reference_audio=reference_audio,
+                mode_with_reference=mode_with_reference,
+                duration_control_enabled=duration_control_enabled,
+                duration_tokens=duration_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                model_path=args.model_path,
+                device=args.device,
+                attn_implementation=args.attn_implementation,
+                max_new_tokens=max_new_tokens,
+            ),
+            inputs=[
+                text,
+                reference_audio,
+                mode_with_reference,
+                duration_control_enabled,
+                duration_tokens,
+                temperature,
+                top_p,
+                top_k,
+                repetition_penalty,
+                max_new_tokens,
+            ],
+            outputs=[output_audio, status],
+        )
+    return demo


+def main():
+    parser = argparse.ArgumentParser(description="MossTTS Gradio Demo")
+    parser.add_argument("--model_path", type=str, default=MODEL_PATH)
+    parser.add_argument("--device", type=str, default="cuda:0")
+    parser.add_argument("--attn_implementation", type=str, default=DEFAULT_ATTN_IMPLEMENTATION)
+    parser.add_argument("--host", type=str, default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=7860)
+    parser.add_argument("--share", action="store_true")
+    args = parser.parse_args()
+
+    runtime_device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+    runtime_dtype = torch.bfloat16 if runtime_device.type == "cuda" else torch.float32
+    args.attn_implementation = resolve_attn_implementation(
+        requested=args.attn_implementation,
+        device=runtime_device,
+        dtype=runtime_dtype,
+    ) or "none"
+    print(f"[INFO] Using attn_implementation={args.attn_implementation}", flush=True)
+
+    # Preload model/processor at startup to avoid first-request cold start latency.
+    preload_started_at = time.monotonic()
+    print(
+        f"[Startup] Preloading backend: model={args.model_path}, device={args.device}, attn={args.attn_implementation}",
+        flush=True,
+    )
+    load_backend(
+        model_path=args.model_path,
+        device_str=args.device,
+        attn_implementation=args.attn_implementation,
+    )
+    print(
+        f"[Startup] Backend preload finished in {time.monotonic() - preload_started_at:.2f}s",
+        flush=True,
+    )
+
+    demo = build_demo(args)
+    demo.queue(max_size=16, default_concurrency_limit=1).launch(
+        server_name=args.host,
+        server_port=args.port,
+        share=args.share,
+    )
+
+
+if __name__ == "__main__":
+    main()
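With the defaults above, the Space entrypoint is simply `python app.py`; the `--model_path`, `--device`, `--attn_implementation`, `--host`, `--port`, and `--share` flags defined in `main()` override the runtime settings.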
assets/audio/reference_en_0.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:488e0e6bb4e48a7eb861f8fc7763565587287cde152cbec141b952089b02b2ef
+size 90594
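The stub above (like each audio file added below) is not raw audio but a Git LFS pointer: a three-line text file recording the LFS spec version, the SHA-256 digest of the real content, and its size in bytes. A minimal sketch of reading such a stub, with parse_lfs_pointer as a hypothetical helper rather than anything shipped in this repo:

def parse_lfs_pointer(path):
    # Each pointer line is "<key> <value>"; blank lines are skipped.
    fields = dict(
        line.split(" ", 1)
        for line in open(path, encoding="utf-8")
        if " " in line
    )
    return {
        "version": fields["version"].strip(),  # LFS spec URL
        "oid": fields["oid"].strip(),          # e.g. "sha256:488e0e6bb4e4..."
        "size": int(fields["size"]),           # byte size of the real file
    }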
assets/audio/reference_en_1.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82e4fad862ccb12a4ac609623fe6c275d167f7dcfc7866ca740f38ab169935c6
+size 213836
assets/audio/reference_en_2.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:582f8e1e3f3792d7b495e159c29a55bb95c4c46e90725c62807b4b12bf341603
+size 322923
assets/audio/reference_en_3.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4bd9c6ffb765fda23297fae21725bc174a3092d9687c3606f11d00ae0df9fc1e
+size 107943
assets/audio/reference_zh_0.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5112b5e2bef2a727534af85da1e56048a5ab5552de7aa7cbb5f48b0fa4f5eec
+size 448172
assets/audio/reference_zh_1.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4ff19c55d55a37dbbd550e6624a2faf6cfa7fd56a9594456b17fbe3838b2245
+size 1480128
assets/audio/reference_zh_2.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1686d3e2b1fe2f6b079cf6a41a9cd9ba31c8f9d3cfe03ff411dd0359641c0c8
+size 505586
assets/audio/reference_zh_3.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cffa7c5d91c28895caf51c38418af9651c82a4e16a8e4c04e10991bf80cc04cc
+size 347949
assets/text/moss_tts_example_texts.jsonl ADDED
@@ -0,0 +1,8 @@
+{"id":"zh/0","language":"zh","role":"可爱的小女孩","text":"亲爱的你,\n你好呀。\n今天,我想用最认真、最温柔的声音,对你说一些重要的话。\n这些话,像一颗小小的星星,希望能在你的心里慢慢发光。"}
+{"id":"zh/1","language":"zh","role":"吴俊全老师","text":"从1948年9月12日至1949年1月31日,连续组织了震惊世界的辽沈、淮海、平津三个大战役,这一百四十二个昼夜中,双方统帅部和各级指挥部所拍发的电码讯号错综交汇,织成一面无形的网,从大气层覆盖下来,于是便注定了中国的山川将会怎样排列,流云又当如何变幻。"}
+{"id":"zh/2","language":"zh","role":"原神胡桃","text":"嘿——你在听吗?\n嗯,不说话也没关系啦,反正我已经习惯自言自语了。\n我是胡桃,往生堂第七十七代堂主。\n别紧张别紧张,我今天不是来“请你喝茶”的——至少现在还不是。\n很多人一听到“往生堂”,就皱起眉头,好像我一开口,空气都要凉三分。\n可你看啊,太阳每天都会落下,可谁会因此不喜欢黄昏呢?\n生与死也是一样的道理嘛——\n不是终点,而是换一条路走走。"}
+{"id":"zh/3","language":"zh","role":"明星杨幂","text":"有些人喜欢被照顾,\n而我更习惯照亮自己。\n\n不是不需要依靠,\n只是明白——\n真正能陪你走到最后的,\n从来都不是运气。\n\n我见过凌晨四点的城市,\n也见过掌声散去后的安静。\n那些看起来毫不费力的从容,\n其实都藏着一次次咬牙坚持。"}
+{"id":"en/0","language":"en","role":"Taylor Swift","text":"Tonight, I just want to take a second and breathe this in with you.\nBecause moments like this don’t happen by accident. They’re built—one lyric at a time, one late night at a time, one brave decision at a time. They’re built by people who keep showing up, even when life is loud, even when the world is heavy, even when they’re not sure anyone sees the effort they’re making."}
+{"id":"en/1","language":"en","role":"Iron Man","text":"Look, I know what you’re thinking. Here he goes again. The guy in the metal suit, the walking ego with a repulsor problem, about to make a speech like it’s a press conference and I’m getting paid by the syllable. Relax. This one isn’t for the cameras. No sponsors, no applause, no clever angle that makes me look taller than I already am."}
+{"id":"en/2","language":"en","role":"David Attenborough","text":"In the quiet hours before dawn, the world looks unfinished. Streets are empty, windows are dark, and the air holds its breath as if waiting for a cue. But beneath the stillness, everything is moving. Water is traveling through pipes. Electricity is humming along invisible lines. Seeds are pushing against soil. Somewhere, a hand reaches for a switch, and a day begins."}
+{"id":"en/3","language":"en","role":"Rick Sanchez","text":"Look, you keep staring at the sky like it’s a customer service desk, waiting for the universe to hand you a receipt that says your pain was “worth it.” Newsflash: the cosmos doesn’t do refunds, it does entropy. It does random collisions of atoms that occasionally arrange themselves into a biped with anxiety and a subscription to self-importance."}
assets/text/moss_voice_generator_example_texts.jsonl ADDED
@@ -0,0 +1,8 @@
+{"id":"zh/0","language":"zh","instruction":"撕心裂肺,声泪俱下的中年女性","text":"皇上,臣妾做不到啊!皇上,您就杀了臣妾吧!"}
+{"id":"zh/1","language":"zh","instruction":"年轻女性,开头傲慢不屑,发现对方身份后秒怂,疯狂道歉,惊慌失措","text":"你谁啊,关你什么事?啊…王总,您好您好,我不知道是您……"}
+{"id":"zh/2","language":"zh","instruction":"疲惫沙哑的老年声音缓慢抱怨,带有轻微呻吟。","text":"哎呀,我的老腰啊,这年纪大了就是不行了。"}
+{"id":"zh/3","language":"zh","instruction":"粗犷急躁的海盗船长,语速快,语调低沉而充满命令,带着一股不容置疑的霸道。","text":"快点!把那箱金币搬过来!速度快点!别磨磨蹭蹭的!我们必须在涨潮之前离开这里,否则就来不及了!"}
+{"id":"en/0","language":"en","instruction":"Mom scolding kid for breaking a vase, then seeing he cut himself, shifting to concern","text":"How many times have I told you not to run in the house?! You could have…… oh honey, you're bleeding! Let me see your hand…… It's okay, baby."}
+{"id":"en/1","language":"en","instruction":"An elderly female voice, slightly nasal and soft, speaking in a frail, polite British tone, conveying subtle discomfort with gentle hesitation.","text":"Achoo! Oh dear, I do believe I'm catching a cold. This dreadful weather is just too much."}
+{"id":"en/2","language":"en","instruction":"Little girl, innocent and curious, high-pitched and adorable","text":"Mommy, why is the sky blue? And why do birds fly? And why-"}
+{"id":"en/3","language":"en","instruction":"Emotional pop ballad with smooth, melodic delivery, slow tempo with gentle vibrato on sustained notes, conveying hope and vulnerability.","text":"Walking down this empty street tonight, searching for a guiding light, stars above shine oh so bright, everything will be alright"}