Spaces:
Running
on
A100
Running
on
A100
Fix loading of cover source audio by using the ffmpeg backend
Browse files
- acestep/audio_utils.py +48 -2
- acestep/handler.py +48 -4
acestep/audio_utils.py
CHANGED
|
@@ -128,6 +128,52 @@ class AudioSaver:
|
|
| 128 |
logger.error(f"[AudioSaver] Failed to save audio: {e}")
|
| 129 |
raise
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
def convert_audio(
|
| 132 |
self,
|
| 133 |
input_path: Union[str, Path],
|
|
@@ -153,8 +199,8 @@ class AudioSaver:
|
|
| 153 |
if not input_path.exists():
|
| 154 |
raise FileNotFoundError(f"Input file not found: {input_path}")
|
| 155 |
|
| 156 |
-
# Load audio
|
| 157 |
-
audio_tensor, sample_rate =
|
| 158 |
|
| 159 |
# Save as new format
|
| 160 |
output_path = self.save_audio(
|
|
|
|
| 128 |
logger.error(f"[AudioSaver] Failed to save audio: {e}")
|
| 129 |
raise
|
| 130 |
|
| 131 |
+
def _load_audio_file(self, audio_file: Union[str, Path]) -> Tuple[torch.Tensor, int]:
    """
    Load audio file with fallback backends for compatibility.

    In HuggingFace Space environment, the default torchcodec backend may fail
    due to missing CUDA dependencies (libnppicc.so.12). This method tries the
    ffmpeg backend first, then sox, then soundfile, and returns the result of
    the first backend that succeeds.

    Args:
        audio_file: Path to the audio file

    Returns:
        Tuple of (audio_tensor, sample_rate)

    Raises:
        FileNotFoundError: If the audio file doesn't exist
        Exception: If all backends fail to load the audio; the exception
            raised by the last backend (soundfile) is propagated unchanged
    """
    audio_file = str(audio_file)

    # Fail fast with a clear error instead of three misleading backend errors.
    if not Path(audio_file).exists():
        raise FileNotFoundError(f"Audio file not found: {audio_file}")

    # Backends in order of preference; each is paired with its successor so
    # the debug message can say which backend is tried next.
    fallback_order = ("ffmpeg", "sox", "soundfile")
    failures = []
    for backend, next_backend in zip(fallback_order, fallback_order[1:] + (None,)):
        try:
            audio, sr = torchaudio.load(audio_file, backend=backend)
            return audio, sr
        except Exception as exc:
            # Remember every failure so the final error log explains why each
            # backend was rejected, not only that all of them were.
            failures.append(f"{backend}: {exc}")
            if next_backend is not None:
                logger.debug(
                    f"[AudioSaver._load_audio_file] {backend} backend failed: {exc}, "
                    f"trying {next_backend} backend"
                )
                continue
            logger.error(
                f"[AudioSaver._load_audio_file] All backends failed to load audio: "
                f"{audio_file} ({'; '.join(failures)})"
            )
            # Bare raise keeps the last backend's exception type and traceback.
            raise
|
| 176 |
+
|
| 177 |
def convert_audio(
|
| 178 |
self,
|
| 179 |
input_path: Union[str, Path],
|
|
|
|
| 199 |
if not input_path.exists():
|
| 200 |
raise FileNotFoundError(f"Input file not found: {input_path}")
|
| 201 |
|
| 202 |
+
# Load audio with fallback backends
|
| 203 |
+
audio_tensor, sample_rate = self._load_audio_file(input_path)
|
| 204 |
|
| 205 |
# Save as new format
|
| 206 |
output_path = self.save_audio(
|
acestep/handler.py
CHANGED
|
@@ -1062,6 +1062,50 @@ class AceStepHandler:
|
|
| 1062 |
instruction = instruction + ":"
|
| 1063 |
return instruction
|
| 1064 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1065 |
def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
|
| 1066 |
"""
|
| 1067 |
Normalize audio to stereo 48kHz format.
|
|
@@ -1277,8 +1321,8 @@ class AceStepHandler:
|
|
| 1277 |
return None
|
| 1278 |
|
| 1279 |
try:
|
| 1280 |
-
# Load audio file
|
| 1281 |
-
audio, sr =
|
| 1282 |
|
| 1283 |
logger.debug(f"[process_reference_audio] Reference audio shape: {audio.shape}")
|
| 1284 |
logger.debug(f"[process_reference_audio] Reference audio sample rate: {sr}")
|
|
@@ -1332,8 +1376,8 @@ class AceStepHandler:
|
|
| 1332 |
return None
|
| 1333 |
|
| 1334 |
try:
|
| 1335 |
-
# Load audio file
|
| 1336 |
-
audio, sr =
|
| 1337 |
|
| 1338 |
# Normalize to stereo 48kHz
|
| 1339 |
audio = self._normalize_audio_to_stereo_48k(audio, sr)
|
|
|
|
| 1062 |
instruction = instruction + ":"
|
| 1063 |
return instruction
|
| 1064 |
|
| 1065 |
+
def _load_audio_file(self, audio_file) -> Tuple[torch.Tensor, int]:
    """
    Load audio file with fallback backends for compatibility.

    In HuggingFace Space environment, the default torchcodec backend may fail
    due to missing CUDA dependencies (libnppicc.so.12). This method tries the
    ffmpeg backend first, then sox, then soundfile, and returns the result of
    the first backend that succeeds.

    Args:
        audio_file: Path to the audio file

    Returns:
        Tuple of (audio_tensor, sample_rate)

    Raises:
        FileNotFoundError: If the audio file doesn't exist
        Exception: If all backends fail to load the audio; the exception
            raised by the last backend (soundfile) is propagated unchanged
    """
    # Fail fast with a clear error instead of three misleading backend errors.
    if not os.path.exists(audio_file):
        raise FileNotFoundError(f"Audio file not found: {audio_file}")

    # Backends in order of preference; each is paired with its successor so
    # the debug message can say which backend is tried next.
    fallback_order = ("ffmpeg", "sox", "soundfile")
    failures = []
    for backend, next_backend in zip(fallback_order, fallback_order[1:] + (None,)):
        try:
            audio, sr = torchaudio.load(audio_file, backend=backend)
            return audio, sr
        except Exception as exc:
            # Remember every failure so the final error log explains why each
            # backend was rejected, not only that all of them were.
            failures.append(f"{backend}: {exc}")
            if next_backend is not None:
                logger.debug(
                    f"[_load_audio_file] {backend} backend failed: {exc}, "
                    f"trying {next_backend} backend"
                )
                continue
            logger.error(
                f"[_load_audio_file] All backends failed to load audio: "
                f"{audio_file} ({'; '.join(failures)})"
            )
            # Bare raise keeps the last backend's exception type and traceback.
            raise
|
| 1108 |
+
|
| 1109 |
def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
|
| 1110 |
"""
|
| 1111 |
Normalize audio to stereo 48kHz format.
|
|
|
|
| 1321 |
return None
|
| 1322 |
|
| 1323 |
try:
|
| 1324 |
+
# Load audio file with fallback backends
|
| 1325 |
+
audio, sr = self._load_audio_file(audio_file)
|
| 1326 |
|
| 1327 |
logger.debug(f"[process_reference_audio] Reference audio shape: {audio.shape}")
|
| 1328 |
logger.debug(f"[process_reference_audio] Reference audio sample rate: {sr}")
|
|
|
|
| 1376 |
return None
|
| 1377 |
|
| 1378 |
try:
|
| 1379 |
+
# Load audio file with fallback backends
|
| 1380 |
+
audio, sr = self._load_audio_file(audio_file)
|
| 1381 |
|
| 1382 |
# Normalize to stereo 48kHz
|
| 1383 |
audio = self._normalize_audio_to_stereo_48k(audio, sr)
|