update
Browse files
- README.md +6 -3
- processing_moss_tts.py +3 -1
README.md
CHANGED
|
@@ -89,6 +89,7 @@ MOSS-TTSD uses a **continuation** workflow: provide reference audio for each spe
|
|
| 89 |
import os
|
| 90 |
from pathlib import Path
|
| 91 |
import torch
|
|
|
|
| 92 |
import torchaudio
|
| 93 |
from transformers import AutoModel, AutoProcessor
|
| 94 |
|
|
@@ -125,8 +126,10 @@ text_to_generate = "[S1] Listen, let's talk business. China. I'm hearing things.
|
|
| 125 |
# --- Load & resample audio ---
|
| 126 |
|
| 127 |
target_sr = int(processor.model_config.sampling_rate)
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
| 130 |
|
| 131 |
if wav1.shape[0] > 1:
|
| 132 |
wav1 = wav1.mean(dim=0, keepdim=True)
|
|
@@ -255,4 +258,4 @@ For open-source models, annotators are asked to score each sample pair in terms
|
|
| 255 |
|
| 256 |
For closed-source models, annotators are only asked to choose the overall preferred one in each pair, and we compute the win rate accordingly.
|
| 257 |

|
| 258 |
-

|
|
|
|
| 89 |
import os
|
| 90 |
from pathlib import Path
|
| 91 |
import torch
|
| 92 |
+
import soundfile as sf
|
| 93 |
import torchaudio
|
| 94 |
from transformers import AutoModel, AutoProcessor
|
| 95 |
|
|
|
|
| 126 |
# --- Load & resample audio ---
|
| 127 |
|
| 128 |
target_sr = int(processor.model_config.sampling_rate)
|
| 129 |
+
audio1, sr1 = sf.read(prompt_audio_speaker1, dtype="float32", always_2d=True)
|
| 130 |
+
audio2, sr2 = sf.read(prompt_audio_speaker2, dtype="float32", always_2d=True)
|
| 131 |
+
wav1 = torch.from_numpy(audio1).transpose(0, 1).contiguous()
|
| 132 |
+
wav2 = torch.from_numpy(audio2).transpose(0, 1).contiguous()
|
| 133 |
|
| 134 |
if wav1.shape[0] > 1:
|
| 135 |
wav1 = wav1.mean(dim=0, keepdim=True)
|
|
|
|
| 258 |
|
| 259 |
For closed-source models, annotators are only asked to choose the overall preferred one in each pair, and we compute the win rate accordingly.
|
| 260 |

|
| 261 |
+

|
processing_moss_tts.py
CHANGED
|
@@ -18,6 +18,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type, Union, Literal, Final
|
|
| 18 |
from dataclasses import dataclass
|
| 19 |
from pathlib import Path
|
| 20 |
import re
|
|
|
|
| 21 |
import torchaudio
|
| 22 |
|
| 23 |
from transformers import processing_utils
|
|
@@ -896,7 +897,8 @@ class MossTTSDelayProcessor(ProcessorMixin):
|
|
| 896 |
target_sr = int(self.model_config.sampling_rate)
|
| 897 |
wav_list: List[torch.Tensor] = []
|
| 898 |
for wav_path in wav_path_list:
|
| 899 |
-
|
|
|
|
| 900 |
if int(sr) != target_sr:
|
| 901 |
wav = torchaudio.functional.resample(
|
| 902 |
waveform=wav,
|
|
|
|
| 18 |
from dataclasses import dataclass
|
| 19 |
from pathlib import Path
|
| 20 |
import re
|
| 21 |
+
import soundfile as sf
|
| 22 |
import torchaudio
|
| 23 |
|
| 24 |
from transformers import processing_utils
|
|
|
|
| 897 |
target_sr = int(self.model_config.sampling_rate)
|
| 898 |
wav_list: List[torch.Tensor] = []
|
| 899 |
for wav_path in wav_path_list:
|
| 900 |
+
audio, sr = sf.read(wav_path, dtype="float32", always_2d=True)
|
| 901 |
+
wav = torch.from_numpy(audio).transpose(0, 1).contiguous()
|
| 902 |
if int(sr) != target_sr:
|
| 903 |
wav = torchaudio.functional.resample(
|
| 904 |
waveform=wav,
|