Improve model card: add metadata and paper links
#1
by
nielsr HF Staff - opened
README.md
CHANGED
|
@@ -1,242 +1,123 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: apache-2.0
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
<div align="center">
|
| 18 |
-
<a href="https://github.com/OpenMOSS/MOSS-TTS/tree/main"><img src="https://img.shields.io/badge/Project%20Page-GitHub-blue"></a>
|
| 19 |
-
<a href="https://modelscope.cn/collections/OpenMOSS-Team/MOSS-TTS"><img src="https://img.shields.io/badge/ModelScope-Models-lightgrey?logo=modelscope&"></a>
|
| 20 |
-
<a href="https://mosi.cn/#models"><img src="https://img.shields.io/badge/Blog-View-blue?logo=internet-explorer&"></a>
|
| 21 |
-
<a href="https://
|
| 22 |
-
|
| 23 |
-
<a href="https://
|
| 24 |
-
<a href="https://
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
MOSS
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
#### (Optional) Install FlashAttention 2
|
| 125 |
-
|
| 126 |
-
For better speed and lower GPU memory usage, you can install FlashAttention 2 if your hardware supports it.
|
| 127 |
-
|
| 128 |
-
```bash
|
| 129 |
-
pip install --extra-index-url https://download.pytorch.org/whl/cu128 -e ".[flash-attn]"
|
| 130 |
-
```
|
| 131 |
-
|
| 132 |
-
If your machine has limited RAM and many CPU cores, you can cap build parallelism:
|
| 133 |
-
|
| 134 |
-
```bash
|
| 135 |
-
MAX_JOBS=4 pip install --extra-index-url https://download.pytorch.org/whl/cu128 -e ".[flash-attn]"
|
| 136 |
-
```
|
| 137 |
-
|
| 138 |
-
Notes:
|
| 139 |
-
- Dependencies are managed in `pyproject.toml`, which currently pins `torch==2.9.1+cu128` and `torchaudio==2.9.1+cu128`.
|
| 140 |
-
- If FlashAttention 2 fails to build on your machine, you can skip it and use the default attention backend.
|
| 141 |
-
- FlashAttention 2 is only available on supported GPUs and is typically used with `torch.float16` or `torch.bfloat16`.
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
### Basic Usage
|
| 145 |
-
|
| 146 |
-
```python
|
| 147 |
-
from pathlib import Path
|
| 148 |
-
import importlib.util
|
| 149 |
-
import torch
|
| 150 |
-
import torchaudio
|
| 151 |
-
from transformers import AutoModel, AutoProcessor
|
| 152 |
-
# Disable the broken cuDNN SDPA backend
|
| 153 |
-
torch.backends.cuda.enable_cudnn_sdp(False)
|
| 154 |
-
# Keep these enabled as fallbacks
|
| 155 |
-
torch.backends.cuda.enable_flash_sdp(True)
|
| 156 |
-
torch.backends.cuda.enable_mem_efficient_sdp(True)
|
| 157 |
-
torch.backends.cuda.enable_math_sdp(True)
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-SoundEffect"
|
| 161 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 162 |
-
dtype = torch.bfloat16 if device == "cuda" else torch.float32
|
| 163 |
-
|
| 164 |
-
def resolve_attn_implementation() -> str:
|
| 165 |
-
# Prefer FlashAttention 2 when package + device conditions are met.
|
| 166 |
-
if (
|
| 167 |
-
device == "cuda"
|
| 168 |
-
and importlib.util.find_spec("flash_attn") is not None
|
| 169 |
-
and dtype in {torch.float16, torch.bfloat16}
|
| 170 |
-
):
|
| 171 |
-
major, _ = torch.cuda.get_device_capability()
|
| 172 |
-
if major >= 8:
|
| 173 |
-
return "flash_attention_2"
|
| 174 |
-
|
| 175 |
-
# CUDA fallback: use PyTorch SDPA kernels.
|
| 176 |
-
if device == "cuda":
|
| 177 |
-
return "sdpa"
|
| 178 |
-
|
| 179 |
-
# CPU fallback.
|
| 180 |
-
return "eager"
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
attn_implementation = resolve_attn_implementation()
|
| 184 |
-
print(f"[INFO] Using attn_implementation={attn_implementation}")
|
| 185 |
-
|
| 186 |
-
processor = AutoProcessor.from_pretrained(
|
| 187 |
-
pretrained_model_name_or_path,
|
| 188 |
-
trust_remote_code=True,
|
| 189 |
-
)
|
| 190 |
-
processor.audio_tokenizer = processor.audio_tokenizer.to(device)
|
| 191 |
-
|
| 192 |
-
text_1 = "雷声隆隆,雨声淅沥。"
|
| 193 |
-
text_2 = "清晰脚步声在水泥地面回响,节奏稳定。"
|
| 194 |
-
|
| 195 |
-
conversations = [
|
| 196 |
-
[processor.build_user_message(ambient_sound=text_1)],
|
| 197 |
-
[processor.build_user_message(ambient_sound=text_2)]
|
| 198 |
-
]
|
| 199 |
-
|
| 200 |
-
model = AutoModel.from_pretrained(
|
| 201 |
-
pretrained_model_name_or_path,
|
| 202 |
-
trust_remote_code=True,
|
| 203 |
-
# If FlashAttention 2 is installed, you can set attn_implementation="flash_attention_2"
|
| 204 |
-
attn_implementation=attn_implementation,
|
| 205 |
-
torch_dtype=dtype,
|
| 206 |
-
).to(device)
|
| 207 |
-
model.eval()
|
| 208 |
-
|
| 209 |
-
batch_size = 1
|
| 210 |
-
|
| 211 |
-
save_dir = Path("inference_root")
|
| 212 |
-
save_dir.mkdir(exist_ok=True, parents=True)
|
| 213 |
-
sample_idx = 0
|
| 214 |
-
with torch.no_grad():
|
| 215 |
-
for start in range(0, len(conversations), batch_size):
|
| 216 |
-
batch_conversations = conversations[start : start + batch_size]
|
| 217 |
-
batch = processor(batch_conversations, mode="generation")
|
| 218 |
-
input_ids = batch["input_ids"].to(device)
|
| 219 |
-
attention_mask = batch["attention_mask"].to(device)
|
| 220 |
-
|
| 221 |
-
outputs = model.generate(
|
| 222 |
-
input_ids=input_ids,
|
| 223 |
-
attention_mask=attention_mask,
|
| 224 |
-
max_new_tokens=4096,
|
| 225 |
-
)
|
| 226 |
-
|
| 227 |
-
for message in processor.decode(outputs):
|
| 228 |
-
audio = message.audio_codes_list[0]
|
| 229 |
-
out_path = save_dir / f"sample{sample_idx}.wav"
|
| 230 |
-
sample_idx += 1
|
| 231 |
-
torchaudio.save(out_path, audio.unsqueeze(0), processor.model_config.sampling_rate)
|
| 232 |
-
|
| 233 |
-
```
|
| 234 |
-
|
| 235 |
-
### Input Types
|
| 236 |
-
|
| 237 |
-
**UserMessage**
|
| 238 |
-
| Field | Type | Required | Description |
|
| 239 |
-
|---|---|---:|---|
|
| 240 |
-
| `ambient_sound` | `str` | Yes | Description of environment sound & sound effect |
|
| 241 |
-
| `tokens` | `int` | No | Expected number of audio tokens. **1s ≈ 12.5 tokens**. |
|
| 242 |
-
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
pipeline_tag: text-to-audio
|
| 4 |
+
library_name: transformers
|
| 5 |
+
tags:
|
| 6 |
+
- text-to-audio
|
| 7 |
+
- audio-generation
|
| 8 |
+
- moss-tts
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# MOSS-SoundEffect
|
| 12 |
+
|
| 13 |
+
MOSS-SoundEffect is a high-fidelity text-to-sound model from the **MOSS-TTS Family**, developed by the [OpenMOSS team](https://www.open-moss.com/) and [MOSI.AI](https://mosi.cn/#hero). It generates ambient soundscapes and concrete sound effects directly from text descriptions.
|
| 14 |
+
|
| 15 |
+
The model architecture and underlying tokenization are presented in the paper: **[MOSS-Audio-Tokenizer: Scaling Audio Tokenizers for Future Audio Foundation Models](https://huggingface.co/papers/2602.10934)**.
|
| 16 |
+
|
| 17 |
+
<div align="center">
|
| 18 |
+
<a href="https://github.com/OpenMOSS/MOSS-TTS/tree/main"><img src="https://img.shields.io/badge/Project%20Page-GitHub-blue"></a>
|
| 19 |
+
<a href="https://modelscope.cn/collections/OpenMOSS-Team/MOSS-TTS"><img src="https://img.shields.io/badge/ModelScope-Models-lightgrey?logo=modelscope&"></a>
|
| 20 |
+
<a href="https://mosi.cn/#models"><img src="https://img.shields.io/badge/Blog-View-blue?logo=internet-explorer&"></a>
|
| 21 |
+
<a href="https://huggingface.co/papers/2602.10934"><img src="https://img.shields.io/badge/Arxiv-2602.10934-red?logo=arxiv&"></a>
|
| 22 |
+
<a href="https://studio.mosi.cn"><img src="https://img.shields.io/badge/AIStudio-Try-green?logo=internet-explorer&"></a>
|
| 23 |
+
<a href="https://x.com/Open_MOSS"><img src="https://img.shields.io/badge/Twitter-Follow-black?logo=x&"></a>
|
| 24 |
+
<a href="https://discord.gg/fvm5TaWjU3"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&"></a>
|
| 25 |
+
</div>
|
| 26 |
+
|
| 27 |
+
## Overview
|
| 28 |
+
MOSS‑TTS Family is an open‑source **speech and sound generation model family**. It is designed for **high‑fidelity**, **high‑expressiveness**, and **complex real‑world scenarios**, covering stable long‑form speech, multi‑speaker dialogue, voice/character design, environmental sound effects, and real‑time streaming TTS.
|
| 29 |
+
|
| 30 |
+
**MOSS-SoundEffect** specifically focuses on **contextual audio completion** beyond speech, enabling creators and systems to enrich scenes with believable acoustic environments and action‑level cues.
|
| 31 |
+
|
| 32 |
+
### Key Capabilities
|
| 33 |
+
- **Natural environments**: e.g., “fresh snow crunching under footsteps.”
|
| 34 |
+
- **Urban environments**: e.g., “a sports car roaring past on the highway.”
|
| 35 |
+
- **Animals & creatures**: e.g., “early morning park with birds chirping in a quiet atmosphere.”
|
| 36 |
+
- **Human actions**: e.g., “clear footsteps echoing on concrete at a steady rhythm.”
|
| 37 |
+
|
| 38 |
+
## Model Architecture
|
| 39 |
+
MOSS-SoundEffect employs the **MossTTSDelay** architecture, reusing the same discrete token generation backbone for audio synthesis. A text prompt (optionally with simple control tags such as **duration**) is tokenized and fed into the Delay-pattern autoregressive model to predict **RVQ audio tokens** over time. The generated tokens are then decoded by the CAT (Causal Audio Tokenizer) decoder to produce high-fidelity sound effects.
|
| 40 |
+
|
| 41 |
+
## Quick Start
|
| 42 |
+
|
| 43 |
+
### Environment Setup
|
| 44 |
+
|
| 45 |
+
We recommend a clean, isolated Python environment with **Transformers 5.0.0** to avoid dependency conflicts.
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
conda create -n moss-tts python=3.12 -y
|
| 49 |
+
conda activate moss-tts
|
| 50 |
+
|
| 51 |
+
git clone https://github.com/OpenMOSS/MOSS-TTS.git
|
| 52 |
+
cd MOSS-TTS
|
| 53 |
+
pip install --extra-index-url https://download.pytorch.org/whl/cu128 -e .
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
### Basic Usage
|
| 57 |
+
|
| 58 |
+
```python
|
| 59 |
+
from pathlib import Path
|
| 60 |
+
import importlib.util
|
| 61 |
+
import torch
|
| 62 |
+
import torchaudio
|
| 63 |
+
from transformers import AutoModel, AutoProcessor
|
| 64 |
+
|
| 65 |
+
# Disable the broken cuDNN SDPA backend
|
| 66 |
+
torch.backends.cuda.enable_cudnn_sdp(False)
|
| 67 |
+
# Keep these enabled as fallbacks
|
| 68 |
+
torch.backends.cuda.enable_flash_sdp(True)
|
| 69 |
+
torch.backends.cuda.enable_mem_efficient_sdp(True)
|
| 70 |
+
torch.backends.cuda.enable_math_sdp(True)
|
| 71 |
+
|
| 72 |
+
pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-SoundEffect"
|
| 73 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 74 |
+
dtype = torch.bfloat16 if device == "cuda" else torch.float32
|
| 75 |
+
|
| 76 |
+
processor = AutoProcessor.from_pretrained(
|
| 77 |
+
pretrained_model_name_or_path,
|
| 78 |
+
trust_remote_code=True,
|
| 79 |
+
)
|
| 80 |
+
processor.audio_tokenizer = processor.audio_tokenizer.to(device)
|
| 81 |
+
|
| 82 |
+
text = "雷声隆隆,雨声淅沥。" # Thunder rumbling, rain pattering.
|
| 83 |
+
|
| 84 |
+
conversations = [
|
| 85 |
+
[processor.build_user_message(ambient_sound=text)]
|
| 86 |
+
]
|
| 87 |
+
|
| 88 |
+
model = AutoModel.from_pretrained(
|
| 89 |
+
pretrained_model_name_or_path,
|
| 90 |
+
trust_remote_code=True,
|
| 91 |
+
torch_dtype=dtype,
|
| 92 |
+
).to(device)
|
| 93 |
+
model.eval()
|
| 94 |
+
|
| 95 |
+
with torch.no_grad():
|
| 96 |
+
batch = processor(conversations, mode="generation")
|
| 97 |
+
input_ids = batch["input_ids"].to(device)
|
| 98 |
+
attention_mask = batch["attention_mask"].to(device)
|
| 99 |
+
|
| 100 |
+
outputs = model.generate(
|
| 101 |
+
input_ids=input_ids,
|
| 102 |
+
attention_mask=attention_mask,
|
| 103 |
+
max_new_tokens=4096,
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
for message in processor.decode(outputs):
|
| 107 |
+
audio = message.audio_codes_list[0]
|
| 108 |
+
torchaudio.save("sample.wav", audio.unsqueeze(0), processor.model_config.sampling_rate)
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
## Citation
|
| 112 |
+
If you use this model or the CAT architecture in your work, please cite:
|
| 113 |
+
```bibtex
|
| 114 |
+
@misc{gong2026mossaudiotokenizerscalingaudiotokenizers,
|
| 115 |
+
title={MOSS-Audio-Tokenizer: Scaling Audio Tokenizers for Future Audio Foundation Models},
|
| 116 |
+
author={Yitian Gong and Kuangwei Chen and Zhaoye Fei and Xiaogui Yang and Ke Chen and Yang Wang and Kexin Huang and Mingshu Chen and Ruixiao Li and Qingyuan Cheng and Shimin Li and Xipeng Qiu},
|
| 117 |
+
year={2026},
|
| 118 |
+
eprint={2602.10934},
|
| 119 |
+
archivePrefix={arXiv},
|
| 120 |
+
primaryClass={cs.SD},
|
| 121 |
+
url={https://arxiv.org/abs/2602.10934},
|
| 122 |
+
}
|
| 123 |
+
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|