rulerman committed on
Commit a61d33a · verified · 1 Parent(s): 15d2e10

Files changed (2)
  1. README.md +6 -3
  2. processing_moss_tts.py +3 -1
README.md CHANGED

@@ -89,6 +89,7 @@ MOSS-TTSD uses a **continuation** workflow: provide reference audio for each spe
 import os
 from pathlib import Path
 import torch
+import soundfile as sf
 import torchaudio
 from transformers import AutoModel, AutoProcessor
 
@@ -125,8 +126,10 @@ text_to_generate = "[S1] Listen, let's talk business. China. I'm hearing things.
 # --- Load & resample audio ---
 
 target_sr = int(processor.model_config.sampling_rate)
-wav1, sr1 = torchaudio.load(prompt_audio_speaker1)
-wav2, sr2 = torchaudio.load(prompt_audio_speaker2)
+audio1, sr1 = sf.read(prompt_audio_speaker1, dtype="float32", always_2d=True)
+audio2, sr2 = sf.read(prompt_audio_speaker2, dtype="float32", always_2d=True)
+wav1 = torch.from_numpy(audio1).transpose(0, 1).contiguous()
+wav2 = torch.from_numpy(audio2).transpose(0, 1).contiguous()
 
 if wav1.shape[0] > 1:
     wav1 = wav1.mean(dim=0, keepdim=True)
@@ -255,4 +258,4 @@ For open-source models, annotators are asked to score each sample pair in terms
 
 For closed-source models, annotators are only asked to choose the overall preferred one in each pair, and we compute the win rate accordingly.
 ![alt text](assets/VS_Proprietary_Models1.png)
-![alt text](assets/VS_Proprietary_Models2.png)
+![alt text](assets/VS_Proprietary_Models2.png)
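For context, the README example now loads the speaker prompts with `soundfile` and converts the result to the channels-first layout the rest of the snippet expects. A minimal, self-contained sketch of just that loading step (the sampling rate and file path below are placeholders; the README itself takes them from `processor.model_config.sampling_rate` and the `prompt_audio_speaker*` variables):

```python
# Minimal sketch of the soundfile-based loading used in the updated README example.
# Assumes soundfile, torch, and torchaudio are installed; "speaker1.wav" is a placeholder path.
import soundfile as sf
import torch
import torchaudio

target_sr = 16000  # placeholder; the README reads this from processor.model_config.sampling_rate

# sf.read returns a float32 array of shape (frames, channels) when always_2d=True
audio, sr = sf.read("speaker1.wav", dtype="float32", always_2d=True)

# transpose to (channels, frames), the layout the rest of the README snippet expects
wav = torch.from_numpy(audio).transpose(0, 1).contiguous()

# downmix stereo to mono and resample, matching the surrounding README code
if wav.shape[0] > 1:
    wav = wav.mean(dim=0, keepdim=True)
if int(sr) != target_sr:
    wav = torchaudio.functional.resample(wav, orig_freq=int(sr), new_freq=target_sr)
```

Note that `always_2d=True` guarantees a 2-D `(frames, channels)` array even for mono files, so the transpose and the mono downmix behave uniformly across inputs.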
processing_moss_tts.py CHANGED

@@ -18,6 +18,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type, Union, Literal, Final
 from dataclasses import dataclass
 from pathlib import Path
 import re
+import soundfile as sf
 import torchaudio
 
 from transformers import processing_utils
@@ -896,7 +897,8 @@ class MossTTSDelayProcessor(ProcessorMixin):
         target_sr = int(self.model_config.sampling_rate)
         wav_list: List[torch.Tensor] = []
         for wav_path in wav_path_list:
-            wav, sr = torchaudio.load(wav_path)
+            audio, sr = sf.read(wav_path, dtype="float32", always_2d=True)
+            wav = torch.from_numpy(audio).transpose(0, 1).contiguous()
             if int(sr) != target_sr:
                 wav = torchaudio.functional.resample(
                     waveform=wav,
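The same pattern replaces `torchaudio.load` inside `MossTTSDelayProcessor`. A hypothetical standalone version of the updated loop is sketched below; the helper name is illustrative and not part of the repository, and since the hunk is cut off inside the `resample` call, the tail of the loop (appending each waveform to `wav_list`) is an assumption based on the surrounding code.

```python
# Hypothetical standalone equivalent of the updated prompt-loading loop.
from typing import List

import soundfile as sf
import torch
import torchaudio


def load_prompt_wavs(wav_path_list: List[str], target_sr: int) -> List[torch.Tensor]:
    wav_list: List[torch.Tensor] = []
    for wav_path in wav_path_list:
        # soundfile yields (frames, channels); transpose to (channels, frames)
        audio, sr = sf.read(wav_path, dtype="float32", always_2d=True)
        wav = torch.from_numpy(audio).transpose(0, 1).contiguous()
        if int(sr) != target_sr:
            wav = torchaudio.functional.resample(
                waveform=wav, orig_freq=int(sr), new_freq=target_sr
            )
        wav_list.append(wav)  # assumed continuation; the diff ends mid-resample
    return wav_list
```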