rulerman committed on
Commit a61d33a · verified · 1 Parent(s): 15d2e10

Files changed (2)
  1. README.md +6 -3
  2. processing_moss_tts.py +3 -1
README.md CHANGED

@@ -89,6 +89,7 @@ MOSS-TTSD uses a **continuation** workflow: provide reference audio for each spe
 import os
 from pathlib import Path
 import torch
+import soundfile as sf
 import torchaudio
 from transformers import AutoModel, AutoProcessor
 
@@ -125,8 +126,10 @@ text_to_generate = "[S1] Listen, let's talk business. China. I'm hearing things.
 # --- Load & resample audio ---
 
 target_sr = int(processor.model_config.sampling_rate)
-wav1, sr1 = torchaudio.load(prompt_audio_speaker1)
-wav2, sr2 = torchaudio.load(prompt_audio_speaker2)
+audio1, sr1 = sf.read(prompt_audio_speaker1, dtype="float32", always_2d=True)
+audio2, sr2 = sf.read(prompt_audio_speaker2, dtype="float32", always_2d=True)
+wav1 = torch.from_numpy(audio1).transpose(0, 1).contiguous()
+wav2 = torch.from_numpy(audio2).transpose(0, 1).contiguous()
 
 if wav1.shape[0] > 1:
     wav1 = wav1.mean(dim=0, keepdim=True)
@@ -255,4 +258,4 @@ For open-source models, annotators are asked to score each sample pair in terms
 
 For closed-source models, annotators are only asked to choose the overall preferred one in each pair, and we compute the win rate accordingly.
 ![alt text](assets/VS_Proprietary_Models1.png)
-![alt text](assets/VS_Proprietary_Models2.png)
+![alt text](assets/VS_Proprietary_Models2.png)
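For context, the README example now loads the speaker prompts with `soundfile` and converts the result to the channels-first layout the rest of the snippet expects. A minimal, self-contained sketch of just that loading step (the sampling rate and file path below are placeholders; the README itself takes them from `processor.model_config.sampling_rate` and the `prompt_audio_speaker*` variables):

```python
# Minimal sketch of the soundfile-based loading used in the updated README example.
# Assumes soundfile, torch, and torchaudio are installed; "speaker1.wav" is a placeholder path.
import soundfile as sf
import torch
import torchaudio

target_sr = 16000  # placeholder; the README reads this from processor.model_config.sampling_rate

# sf.read returns a float32 array of shape (frames, channels) when always_2d=True
audio, sr = sf.read("speaker1.wav", dtype="float32", always_2d=True)

# transpose to (channels, frames), the layout the rest of the README snippet expects
wav = torch.from_numpy(audio).transpose(0, 1).contiguous()

# downmix stereo to mono and resample, matching the surrounding README code
if wav.shape[0] > 1:
    wav = wav.mean(dim=0, keepdim=True)
if int(sr) != target_sr:
    wav = torchaudio.functional.resample(wav, orig_freq=int(sr), new_freq=target_sr)
```

Note that `always_2d=True` guarantees a 2-D `(frames, channels)` array even for mono files, so the transpose and the mono downmix behave uniformly across inputs.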
processing_moss_tts.py CHANGED

@@ -18,6 +18,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type, Union, Literal, Final
 from dataclasses import dataclass
 from pathlib import Path
 import re
+import soundfile as sf
 import torchaudio
 
 from transformers import processing_utils
@@ -896,7 +897,8 @@ class MossTTSDelayProcessor(ProcessorMixin):
         target_sr = int(self.model_config.sampling_rate)
         wav_list: List[torch.Tensor] = []
         for wav_path in wav_path_list:
-            wav, sr = torchaudio.load(wav_path)
+            audio, sr = sf.read(wav_path, dtype="float32", always_2d=True)
+            wav = torch.from_numpy(audio).transpose(0, 1).contiguous()
             if int(sr) != target_sr:
                 wav = torchaudio.functional.resample(
                     waveform=wav,
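The same pattern replaces `torchaudio.load` inside `MossTTSDelayProcessor`. A hypothetical standalone version of the updated loop is sketched below; the helper name is illustrative and not part of the repository, and since the hunk is cut off inside the `resample` call, the tail of the loop (appending each waveform to `wav_list`) is an assumption based on the surrounding code.

```python
# Hypothetical standalone equivalent of the updated prompt-loading loop.
from typing import List

import soundfile as sf
import torch
import torchaudio


def load_prompt_wavs(wav_path_list: List[str], target_sr: int) -> List[torch.Tensor]:
    wav_list: List[torch.Tensor] = []
    for wav_path in wav_path_list:
        # soundfile yields (frames, channels); transpose to (channels, frames)
        audio, sr = sf.read(wav_path, dtype="float32", always_2d=True)
        wav = torch.from_numpy(audio).transpose(0, 1).contiguous()
        if int(sr) != target_sr:
            wav = torchaudio.functional.resample(
                waveform=wav, orig_freq=int(sr), new_freq=target_sr
            )
        wav_list.append(wav)  # assumed continuation; the diff ends mid-resample
    return wav_list
```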