Spaces:

ACE-Step
/

Ace-Step-v1.5

Running on A100

App Files Files Community

ChuxiJ commited on 21 days ago

Commit

288b94f

1 Parent(s): 96b5f27

load audio fallback

Browse files

Files changed (2) hide show

acestep/audio_utils.py +26 -7
acestep/handler.py +25 -7

acestep/audio_utils.py CHANGED Viewed

@@ -135,11 +135,9 @@ class AudioSaver:
     def _load_audio_file(self, audio_file: Union[str, Path]) -> Tuple[torch.Tensor, int]:
         """
-        Load audio file using torchaudio.
-        Note: TORCHAUDIO_USE_TORCHCODEC=0 is set at module level to disable
-        torchcodec backend and avoid CUDA dependency issues on HuggingFace Space.
-        This makes torchaudio use ffmpeg backend by default.
         Args:
             audio_file: Path to the audio file
@@ -149,6 +147,7 @@ class AudioSaver:
         Raises:
             FileNotFoundError: If the audio file doesn't exist
         """
         audio_file = str(audio_file)
@@ -156,9 +155,29 @@ class AudioSaver:
         if not Path(audio_file).exists():
             raise FileNotFoundError(f"Audio file not found: {audio_file}")
-        # Load audio using default backend (ffmpeg, since torchcodec is disabled)
-        audio, sr = torchaudio.load(audio_file)
-        return audio, sr
     def convert_audio(
         self,

     def _load_audio_file(self, audio_file: Union[str, Path]) -> Tuple[torch.Tensor, int]:
         """
+        Load audio file with ffmpeg backend, fallback to soundfile if failed.
+        This handles CUDA dependency issues with torchcodec on HuggingFace Space.
         Args:
             audio_file: Path to the audio file
         Raises:
             FileNotFoundError: If the audio file doesn't exist
+            Exception: If all methods fail to load the audio
         """
         audio_file = str(audio_file)
         if not Path(audio_file).exists():
             raise FileNotFoundError(f"Audio file not found: {audio_file}")
+        # Try torchaudio with explicit ffmpeg backend first
+        try:
+            audio, sr = torchaudio.load(audio_file, backend="ffmpeg")
+            return audio, sr
+        except Exception as e:
+            logger.debug(f"[AudioSaver._load_audio_file] ffmpeg backend failed: {e}, trying soundfile fallback")
+        # Fallback: use soundfile directly (most compatible)
+        try:
+            import soundfile as sf
+            audio_np, sr = sf.read(audio_file)
+            # soundfile returns [samples, channels] or [samples], convert to [channels, samples]
+            audio = torch.from_numpy(audio_np).float()
+            if audio.dim() == 1:
+                # Mono: [samples] -> [1, samples]
+                audio = audio.unsqueeze(0)
+            else:
+                # Stereo: [samples, channels] -> [channels, samples]
+                audio = audio.T
+            return audio, sr
+        except Exception as e:
+            logger.error(f"[AudioSaver._load_audio_file] All methods failed to load audio: {audio_file}, error: {e}")
+            raise
     def convert_audio(
         self,

acestep/handler.py CHANGED Viewed

@@ -1068,11 +1068,9 @@ class AceStepHandler:
     def _load_audio_file(self, audio_file) -> Tuple[torch.Tensor, int]:
         """
-        Load audio file using torchaudio.
-        Note: TORCHAUDIO_USE_TORCHCODEC=0 is set at module level to disable
-        torchcodec backend and avoid CUDA dependency issues on HuggingFace Space.
-        This makes torchaudio use ffmpeg backend by default.
         Args:
             audio_file: Path to the audio file
@@ -1082,14 +1080,34 @@ class AceStepHandler:
         Raises:
             FileNotFoundError: If the audio file doesn't exist
         """
         # Check if file exists first
         if not os.path.exists(audio_file):
             raise FileNotFoundError(f"Audio file not found: {audio_file}")
-        # Load audio using default backend (ffmpeg, since torchcodec is disabled)
-        audio, sr = torchaudio.load(audio_file)
-        return audio, sr
     def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
         """

     def _load_audio_file(self, audio_file) -> Tuple[torch.Tensor, int]:
         """
+        Load audio file with ffmpeg backend, fallback to soundfile if failed.
+        This handles CUDA dependency issues with torchcodec on HuggingFace Space.
         Args:
             audio_file: Path to the audio file
         Raises:
             FileNotFoundError: If the audio file doesn't exist
+            Exception: If all methods fail to load the audio
         """
         # Check if file exists first
         if not os.path.exists(audio_file):
             raise FileNotFoundError(f"Audio file not found: {audio_file}")
+        # Try torchaudio with explicit ffmpeg backend first
+        try:
+            audio, sr = torchaudio.load(audio_file, backend="ffmpeg")
+            return audio, sr
+        except Exception as e:
+            logger.debug(f"[_load_audio_file] ffmpeg backend failed: {e}, trying soundfile fallback")
+        # Fallback: use soundfile directly (most compatible)
+        try:
+            audio_np, sr = sf.read(audio_file)
+            # soundfile returns [samples, channels] or [samples], convert to [channels, samples]
+            audio = torch.from_numpy(audio_np).float()
+            if audio.dim() == 1:
+                # Mono: [samples] -> [1, samples]
+                audio = audio.unsqueeze(0)
+            else:
+                # Stereo: [samples, channels] -> [channels, samples]
+                audio = audio.T
+            return audio, sr
+        except Exception as e:
+            logger.error(f"[_load_audio_file] All methods failed to load audio: {audio_file}, error: {e}")
+            raise
     def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
         """