Spaces:
Running
on
A100
Running
on
A100
load audio fallback
Browse files
- acestep/audio_utils.py +26 -7
- acestep/handler.py +25 -7
acestep/audio_utils.py
CHANGED
|
@@ -135,11 +135,9 @@ class AudioSaver:
|
|
| 135 |
|
| 136 |
def _load_audio_file(self, audio_file: Union[str, Path]) -> Tuple[torch.Tensor, int]:
|
| 137 |
"""
|
| 138 |
-
Load audio file
|
| 139 |
|
| 140 |
-
|
| 141 |
-
torchcodec backend and avoid CUDA dependency issues on HuggingFace Space.
|
| 142 |
-
This makes torchaudio use ffmpeg backend by default.
|
| 143 |
|
| 144 |
Args:
|
| 145 |
audio_file: Path to the audio file
|
|
@@ -149,6 +147,7 @@ class AudioSaver:
|
|
| 149 |
|
| 150 |
Raises:
|
| 151 |
FileNotFoundError: If the audio file doesn't exist
|
|
|
|
| 152 |
"""
|
| 153 |
audio_file = str(audio_file)
|
| 154 |
|
|
@@ -156,9 +155,29 @@ class AudioSaver:
|
|
| 156 |
if not Path(audio_file).exists():
|
| 157 |
raise FileNotFoundError(f"Audio file not found: {audio_file}")
|
| 158 |
|
| 159 |
-
#
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
def convert_audio(
|
| 164 |
self,
|
|
|
|
| 135 |
|
| 136 |
def _load_audio_file(self, audio_file: Union[str, Path]) -> Tuple[torch.Tensor, int]:
|
| 137 |
"""
|
| 138 |
+
Load audio file with ffmpeg backend, fallback to soundfile if failed.
|
| 139 |
|
| 140 |
+
This handles CUDA dependency issues with torchcodec on HuggingFace Space.
|
|
|
|
|
|
|
| 141 |
|
| 142 |
Args:
|
| 143 |
audio_file: Path to the audio file
|
|
|
|
| 147 |
|
| 148 |
Raises:
|
| 149 |
FileNotFoundError: If the audio file doesn't exist
|
| 150 |
+
Exception: If all methods fail to load the audio
|
| 151 |
"""
|
| 152 |
audio_file = str(audio_file)
|
| 153 |
|
|
|
|
| 155 |
if not Path(audio_file).exists():
|
| 156 |
raise FileNotFoundError(f"Audio file not found: {audio_file}")
|
| 157 |
|
| 158 |
+
# Try torchaudio with explicit ffmpeg backend first
|
| 159 |
+
try:
|
| 160 |
+
audio, sr = torchaudio.load(audio_file, backend="ffmpeg")
|
| 161 |
+
return audio, sr
|
| 162 |
+
except Exception as e:
|
| 163 |
+
logger.debug(f"[AudioSaver._load_audio_file] ffmpeg backend failed: {e}, trying soundfile fallback")
|
| 164 |
+
|
| 165 |
+
# Fallback: use soundfile directly (most compatible)
|
| 166 |
+
try:
|
| 167 |
+
import soundfile as sf
|
| 168 |
+
audio_np, sr = sf.read(audio_file)
|
| 169 |
+
# soundfile returns [samples, channels] or [samples], convert to [channels, samples]
|
| 170 |
+
audio = torch.from_numpy(audio_np).float()
|
| 171 |
+
if audio.dim() == 1:
|
| 172 |
+
# Mono: [samples] -> [1, samples]
|
| 173 |
+
audio = audio.unsqueeze(0)
|
| 174 |
+
else:
|
| 175 |
+
# Stereo: [samples, channels] -> [channels, samples]
|
| 176 |
+
audio = audio.T
|
| 177 |
+
return audio, sr
|
| 178 |
+
except Exception as e:
|
| 179 |
+
logger.error(f"[AudioSaver._load_audio_file] All methods failed to load audio: {audio_file}, error: {e}")
|
| 180 |
+
raise
|
| 181 |
|
| 182 |
def convert_audio(
|
| 183 |
self,
|
acestep/handler.py
CHANGED
|
@@ -1068,11 +1068,9 @@ class AceStepHandler:
|
|
| 1068 |
|
| 1069 |
def _load_audio_file(self, audio_file) -> Tuple[torch.Tensor, int]:
|
| 1070 |
"""
|
| 1071 |
-
Load audio file
|
| 1072 |
|
| 1073 |
-
|
| 1074 |
-
torchcodec backend and avoid CUDA dependency issues on HuggingFace Space.
|
| 1075 |
-
This makes torchaudio use ffmpeg backend by default.
|
| 1076 |
|
| 1077 |
Args:
|
| 1078 |
audio_file: Path to the audio file
|
|
@@ -1082,14 +1080,34 @@ class AceStepHandler:
|
|
| 1082 |
|
| 1083 |
Raises:
|
| 1084 |
FileNotFoundError: If the audio file doesn't exist
|
|
|
|
| 1085 |
"""
|
| 1086 |
# Check if file exists first
|
| 1087 |
if not os.path.exists(audio_file):
|
| 1088 |
raise FileNotFoundError(f"Audio file not found: {audio_file}")
|
| 1089 |
|
| 1090 |
-
#
|
| 1091 |
-
|
| 1092 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1093 |
|
| 1094 |
def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
|
| 1095 |
"""
|
|
|
|
| 1068 |
|
| 1069 |
def _load_audio_file(self, audio_file) -> Tuple[torch.Tensor, int]:
|
| 1070 |
"""
|
| 1071 |
+
Load audio file with ffmpeg backend, fallback to soundfile if failed.
|
| 1072 |
|
| 1073 |
+
This handles CUDA dependency issues with torchcodec on HuggingFace Space.
|
|
|
|
|
|
|
| 1074 |
|
| 1075 |
Args:
|
| 1076 |
audio_file: Path to the audio file
|
|
|
|
| 1080 |
|
| 1081 |
Raises:
|
| 1082 |
FileNotFoundError: If the audio file doesn't exist
|
| 1083 |
+
Exception: If all methods fail to load the audio
|
| 1084 |
"""
|
| 1085 |
# Check if file exists first
|
| 1086 |
if not os.path.exists(audio_file):
|
| 1087 |
raise FileNotFoundError(f"Audio file not found: {audio_file}")
|
| 1088 |
|
| 1089 |
+
# Try torchaudio with explicit ffmpeg backend first
|
| 1090 |
+
try:
|
| 1091 |
+
audio, sr = torchaudio.load(audio_file, backend="ffmpeg")
|
| 1092 |
+
return audio, sr
|
| 1093 |
+
except Exception as e:
|
| 1094 |
+
logger.debug(f"[_load_audio_file] ffmpeg backend failed: {e}, trying soundfile fallback")
|
| 1095 |
+
|
| 1096 |
+
# Fallback: use soundfile directly (most compatible)
|
| 1097 |
+
try:
|
| 1098 |
+
audio_np, sr = sf.read(audio_file)
|
| 1099 |
+
# soundfile returns [samples, channels] or [samples], convert to [channels, samples]
|
| 1100 |
+
audio = torch.from_numpy(audio_np).float()
|
| 1101 |
+
if audio.dim() == 1:
|
| 1102 |
+
# Mono: [samples] -> [1, samples]
|
| 1103 |
+
audio = audio.unsqueeze(0)
|
| 1104 |
+
else:
|
| 1105 |
+
# Stereo: [samples, channels] -> [channels, samples]
|
| 1106 |
+
audio = audio.T
|
| 1107 |
+
return audio, sr
|
| 1108 |
+
except Exception as e:
|
| 1109 |
+
logger.error(f"[_load_audio_file] All methods failed to load audio: {audio_file}, error: {e}")
|
| 1110 |
+
raise
|
| 1111 |
|
| 1112 |
def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
|
| 1113 |
"""
|