ChuxiJ committed on
Commit
ac82e04
·
1 Parent(s): ec6b572

Fix cover source-audio loading by using the ffmpeg backend

Browse files
Files changed (2) hide show
  1. acestep/audio_utils.py +48 -2
  2. acestep/handler.py +48 -4
acestep/audio_utils.py CHANGED
@@ -128,6 +128,52 @@ class AudioSaver:
128
  logger.error(f"[AudioSaver] Failed to save audio: {e}")
129
  raise
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  def convert_audio(
132
  self,
133
  input_path: Union[str, Path],
@@ -153,8 +199,8 @@ class AudioSaver:
153
  if not input_path.exists():
154
  raise FileNotFoundError(f"Input file not found: {input_path}")
155
 
156
- # Load audio
157
- audio_tensor, sample_rate = torchaudio.load(str(input_path))
158
 
159
  # Save as new format
160
  output_path = self.save_audio(
 
128
  logger.error(f"[AudioSaver] Failed to save audio: {e}")
129
  raise
130
 
131
def _load_audio_file(self, audio_file: Union[str, Path]) -> Tuple[torch.Tensor, int]:
    """
    Load an audio file, falling back through several torchaudio backends.

    In HuggingFace Space environment, the default torchcodec backend may fail
    due to missing CUDA dependencies (libnppicc.so.12). This method therefore
    requests explicit backends in a fixed order: ffmpeg first, then sox, then
    soundfile as the last resort.

    Args:
        audio_file: Path to the audio file

    Returns:
        Tuple of (audio_tensor, sample_rate)

    Raises:
        FileNotFoundError: If the audio file doesn't exist
        Exception: If all backends fail to load the audio; the exception
            raised by the last backend tried (soundfile) is propagated.
    """
    audio_file = str(audio_file)

    # Check existence up front so a missing file yields one clear error
    # instead of three unrelated backend failures.
    if not Path(audio_file).exists():
        raise FileNotFoundError(f"Audio file not found: {audio_file}")

    # Order matters: the first backend that succeeds wins.
    backends = ("ffmpeg", "sox", "soundfile")
    failures = []

    for position, backend in enumerate(backends):
        try:
            audio, sr = torchaudio.load(audio_file, backend=backend)
            return audio, sr
        except Exception as e:
            # Remember every failure so the final error message explains
            # why each backend was rejected, not just that all of them were.
            failures.append(f"{backend}: {e}")

            if position + 1 < len(backends):
                next_backend = backends[position + 1]
                logger.debug(
                    f"[AudioSaver._load_audio_file] {backend} backend failed: {e}, "
                    f"trying {next_backend} backend"
                )
                continue

            logger.error(
                f"[AudioSaver._load_audio_file] All backends failed to load audio: "
                f"{audio_file} ({'; '.join(failures)})"
            )
            # Re-raise the last backend's exception unchanged so callers keep
            # seeing the same exception types as before.
            raise
176
+
177
  def convert_audio(
178
  self,
179
  input_path: Union[str, Path],
 
199
  if not input_path.exists():
200
  raise FileNotFoundError(f"Input file not found: {input_path}")
201
 
202
+ # Load audio with fallback backends
203
+ audio_tensor, sample_rate = self._load_audio_file(input_path)
204
 
205
  # Save as new format
206
  output_path = self.save_audio(
acestep/handler.py CHANGED
@@ -1062,6 +1062,50 @@ class AceStepHandler:
1062
  instruction = instruction + ":"
1063
  return instruction
1064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1065
  def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
1066
  """
1067
  Normalize audio to stereo 48kHz format.
@@ -1277,8 +1321,8 @@ class AceStepHandler:
1277
  return None
1278
 
1279
  try:
1280
- # Load audio file
1281
- audio, sr = torchaudio.load(audio_file)
1282
 
1283
  logger.debug(f"[process_reference_audio] Reference audio shape: {audio.shape}")
1284
  logger.debug(f"[process_reference_audio] Reference audio sample rate: {sr}")
@@ -1332,8 +1376,8 @@ class AceStepHandler:
1332
  return None
1333
 
1334
  try:
1335
- # Load audio file
1336
- audio, sr = torchaudio.load(audio_file)
1337
 
1338
  # Normalize to stereo 48kHz
1339
  audio = self._normalize_audio_to_stereo_48k(audio, sr)
 
1062
  instruction = instruction + ":"
1063
  return instruction
1064
 
1065
def _load_audio_file(self, audio_file) -> Tuple[torch.Tensor, int]:
    """
    Load an audio file, falling back through several torchaudio backends.

    In HuggingFace Space environment, the default torchcodec backend may fail
    due to missing CUDA dependencies (libnppicc.so.12). This method therefore
    requests explicit backends in a fixed order: ffmpeg first, then sox, then
    soundfile as the last resort.

    Args:
        audio_file: Path to the audio file

    Returns:
        Tuple of (audio_tensor, sample_rate)

    Raises:
        FileNotFoundError: If the audio file doesn't exist
        Exception: If all backends fail to load the audio; the exception
            raised by the last backend tried (soundfile) is propagated.
    """
    # Check existence up front so a missing file yields one clear error
    # instead of three unrelated backend failures.
    if not os.path.exists(audio_file):
        raise FileNotFoundError(f"Audio file not found: {audio_file}")

    # Order matters: the first backend that succeeds wins.
    backends = ("ffmpeg", "sox", "soundfile")
    failures = []

    for position, backend in enumerate(backends):
        try:
            audio, sr = torchaudio.load(audio_file, backend=backend)
            return audio, sr
        except Exception as e:
            # Remember every failure so the final error message explains
            # why each backend was rejected, not just that all of them were.
            failures.append(f"{backend}: {e}")

            if position + 1 < len(backends):
                next_backend = backends[position + 1]
                logger.debug(
                    f"[_load_audio_file] {backend} backend failed: {e}, "
                    f"trying {next_backend} backend"
                )
                continue

            logger.error(
                f"[_load_audio_file] All backends failed to load audio: "
                f"{audio_file} ({'; '.join(failures)})"
            )
            # Re-raise the last backend's exception unchanged so callers keep
            # seeing the same exception types as before.
            raise
1108
+
1109
  def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
1110
  """
1111
  Normalize audio to stereo 48kHz format.
 
1321
  return None
1322
 
1323
  try:
1324
+ # Load audio file with fallback backends
1325
+ audio, sr = self._load_audio_file(audio_file)
1326
 
1327
  logger.debug(f"[process_reference_audio] Reference audio shape: {audio.shape}")
1328
  logger.debug(f"[process_reference_audio] Reference audio sample rate: {sr}")
 
1376
  return None
1377
 
1378
  try:
1379
+ # Load audio file with fallback backends
1380
+ audio, sr = self._load_audio_file(audio_file)
1381
 
1382
  # Normalize to stereo 48kHz
1383
  audio = self._normalize_audio_to_stereo_48k(audio, sr)