switch to zero gpu
- README.md +4 -2
- acestep/gradio_ui/events/__init__.py +23 -2
- acestep/gradio_ui/events/generation_handlers.py +27 -7
- acestep/gradio_ui/events/results_handlers.py +23 -3
- acestep/handler.py +70 -28
- acestep/inference.py +13 -15
- acestep/llm_inference.py +101 -9
- app.py +47 -7
- requirements.txt +6 -7
README.md
CHANGED
@@ -3,10 +3,12 @@ title: ACE-Step v1.5
 emoji: 🎵
 colorFrom: blue
 colorTo: purple
-sdk:
-
+sdk: gradio
+sdk_version: 6.2.0
+python_version: 3.11
 pinned: false
 license: mit
+app_file: app.py
 short_description: Music Generation Foundation Model v1.5
 ---
 
acestep/gradio_ui/events/__init__.py
CHANGED
@@ -2,8 +2,10 @@
 Gradio UI Event Handlers Module
 Main entry point for setting up all event handlers
 """
+import os
 import gradio as gr
 from typing import Optional
+from loguru import logger
 
 # Import handler modules
 from . import generation_handlers as gen_h
@@ -11,6 +13,24 @@ from . import results_handlers as res_h
 from . import training_handlers as train_h
 from acestep.gradio_ui.i18n import t
 
+# HuggingFace Space environment detection for ZeroGPU support
+IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
+
+
+def _get_spaces_gpu_decorator(duration=120):
+    """
+    Get the @spaces.GPU decorator if running in HuggingFace Space environment.
+    Returns identity decorator if not in Space environment.
+    """
+    if IS_HUGGINGFACE_SPACE:
+        try:
+            import spaces
+            return spaces.GPU(duration=duration)
+        except ImportError:
+            logger.warning("spaces package not found, GPU decorator disabled")
+            return lambda func: func
+    return lambda func: func
+
 
 def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section, init_params=None):
     """Setup event handlers connecting UI components and business logic
@@ -618,12 +638,13 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
         ]
     )
 
+    @_get_spaces_gpu_decorator(duration=300)
     def generation_wrapper(selected_model, generation_mode, simple_query_input, simple_vocal_language, *args):
         """Wrapper that selects the appropriate DiT handler based on model selection"""
         # Convert args to list for modification
         args_list = list(args)
 
-        # args order (after simple mode params):
+        # args order (after simple mode params):
         # captions (0), lyrics (1), bpm (2), key_scale (3), time_signature (4), vocal_language (5),
         # inference_steps (6), guidance_scale (7), random_seed_checkbox (8), seed (9),
         # reference_audio (10), audio_duration (11), batch_size_input (12), src_audio (13),
@@ -684,7 +705,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
         # Mark as formatted caption (LM-generated sample)
         args_list[36] = True  # is_format_caption_state
 
-        # Determine which handler to use
+        # Determine which handler to use based on model selection
        active_handler = dit_handler  # Default to primary handler
         if dit_handler_2 is not None and selected_model == config_path_2:
             active_handler = dit_handler_2
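The decorator factory added above degrades to an identity decorator when the `spaces` package is missing or the code is not running on a Space, so the same event-handler code runs locally and on ZeroGPU. Below is a minimal, self-contained sketch of that pattern wired into a toy Gradio app; the `gpu_decorator` and `run_generation` names and the handler body are illustrative stand-ins, not part of this commit, while the duration cap and `demo.queue(max_size=20)` mirror values used in the change.

import os
import gradio as gr

IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None

def gpu_decorator(duration=120):
    # On a Space, return spaces.GPU so each call gets a short-lived GPU allocation;
    # everywhere else, return a no-op decorator.
    if IS_HUGGINGFACE_SPACE:
        try:
            import spaces
            return spaces.GPU(duration=duration)
        except ImportError:
            pass
    return lambda func: func

@gpu_decorator(duration=300)  # cap the ZeroGPU allocation for one generation call
def run_generation(prompt):
    # Illustrative body; the real handler dispatches to the DiT/LLM handlers.
    return f"would generate audio for: {prompt!r}"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Prompt")
    out = gr.Textbox(label="Status")
    gr.Button("Generate").click(run_generation, inputs=box, outputs=out)

demo.queue(max_size=20)

if __name__ == "__main__":
    demo.launch()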
acestep/gradio_ui/events/generation_handlers.py
CHANGED
@@ -8,6 +8,7 @@ import random
 import glob
 import gradio as gr
 from typing import Optional, List, Tuple
+from loguru import logger
 from acestep.constants import (
     TASK_TYPES_TURBO,
     TASK_TYPES_BASE,
@@ -16,6 +17,25 @@ from acestep.gradio_ui.i18n import t
 from acestep.inference import understand_music, create_sample, format_sample
 
 
+# HuggingFace Space environment detection for ZeroGPU support
+IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
+
+
+def _get_spaces_gpu_decorator(duration=120):
+    """
+    Get the @spaces.GPU decorator if running in HuggingFace Space environment.
+    Returns identity decorator if not in Space environment.
+    """
+    if IS_HUGGINGFACE_SPACE:
+        try:
+            import spaces
+            return spaces.GPU(duration=duration)
+        except ImportError:
+            logger.warning("spaces package not found, GPU decorator disabled")
+            return lambda func: func
+    return lambda func: func
+
+
 def parse_and_validate_timesteps(
     timesteps_str: str,
     inference_steps: int
@@ -746,15 +766,15 @@ def handle_generation_mode_change(mode: str):
         think_checkbox_update,  # think_checkbox - disabled for cover/repaint modes
     )
 
-
+@_get_spaces_gpu_decorator(duration=180)
 def process_source_audio(dit_handler, llm_handler, src_audio, constrained_decoding_debug):
     """
     Process source audio: convert to codes and then transcribe.
     This combines convert_src_audio_to_codes_wrapper + transcribe_audio_codes.
 
     Args:
-        dit_handler: DiT handler instance
-        llm_handler: LLM handler instance
+        dit_handler: DiT handler instance
+        llm_handler: LLM handler instance
         src_audio: Path to source audio file
         constrained_decoding_debug: Whether to enable debug logging
 
@@ -799,7 +819,7 @@ def process_source_audio(dit_handler, llm_handler, src_audio, constrained_decodi
         True  # Set is_format_caption to True
     )
 
-
+@_get_spaces_gpu_decorator(duration=180)
 def handle_create_sample(
     llm_handler,
     query: str,
@@ -819,7 +839,7 @@ def handle_create_sample(
     Note: cfg_scale and negative_prompt are not supported in create_sample mode.
 
     Args:
-        llm_handler: LLM handler instance
+        llm_handler: LLM handler instance (unused, fetched from registry)
         query: User's natural language music description
         instrumental: Whether to generate instrumental music
         vocal_language: Preferred vocal language for constrained decoding
@@ -929,7 +949,7 @@ def handle_create_sample(
         result.status_message,  # status_output
     )
 
-
+@_get_spaces_gpu_decorator(duration=180)
 def handle_format_sample(
     llm_handler,
     caption: str,
@@ -952,7 +972,7 @@ def handle_format_sample(
     Note: cfg_scale and negative_prompt are not supported in format mode.
 
     Args:
-        llm_handler: LLM handler instance
+        llm_handler: LLM handler instance (unused, fetched from registry)
        caption: User's caption/description
         lyrics: User's lyrics
         bpm: User-provided BPM (optional, for constrained decoding)
acestep/gradio_ui/events/results_handlers.py
CHANGED
@@ -18,6 +18,26 @@ from acestep.gradio_ui.i18n import t
 from acestep.gradio_ui.events.generation_handlers import parse_and_validate_timesteps
 from acestep.inference import generate_music, GenerationParams, GenerationConfig
 from acestep.audio_utils import save_audio
+from loguru import logger
+
+
+# HuggingFace Space environment detection for ZeroGPU support
+IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
+
+
+def _get_spaces_gpu_decorator(duration=120):
+    """
+    Get the @spaces.GPU decorator if running in HuggingFace Space environment.
+    Returns identity decorator if not in Space environment.
+    """
+    if IS_HUGGINGFACE_SPACE:
+        try:
+            import spaces
+            return spaces.GPU(duration=duration)
+        except ImportError:
+            logger.warning("spaces package not found, GPU decorator disabled")
+            return lambda func: func
+    return lambda func: func
 
 
 def parse_lrc_to_subtitles(lrc_text: str, total_duration: Optional[float] = None) -> List[Dict[str, Any]]:
@@ -1038,7 +1058,7 @@ calculate_score_handler(
         error_msg = t("messages.score_error", error=str(e)) + f"\n{traceback.format_exc()}"
         return error_msg
 
-
+@_get_spaces_gpu_decorator(duration=240)
 def calculate_score_handler_with_selection(
     dit_handler,
     llm_handler,
@@ -1152,7 +1172,7 @@ def calculate_score_handler_with_selection(
         batch_queue
     )
 
-
+@_get_spaces_gpu_decorator(duration=240)
 def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_queue, vocal_language, inference_steps):
     """
     Generate LRC timestamps for a specific audio sample.
@@ -1165,7 +1185,7 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
     This decouples audio value updates from subtitle updates, avoiding flickering.
 
     Args:
-        dit_handler: DiT handler instance
+        dit_handler: DiT handler instance (unused, fetched from registry)
         sample_idx: Which sample to generate LRC for (1-8)
         current_batch_index: Current batch index in batch_queue
         batch_queue: Dictionary storing all batch generation data
acestep/handler.py
CHANGED
@@ -199,14 +199,24 @@ class AceStepHandler:
 
         return model_path
 
-
-
+
+    def is_flash_attn3_available(self) -> bool:
+        """Check if flash-attn3 via kernels library is available"""
         try:
-            import
+            import kernels
             return True
         except ImportError:
             return False
-
+
+    def get_best_attn_implementation(self) -> str:
+        """Get the best available attention implementation"""
+        if self.is_flash_attn3_available():
+            return "kernels-community/flash-attn3"
+        elif self.is_flash_attention_available():
+            return "flash_attention_2"
+        else:
+            return "sdpa"
+
     def is_turbo_model(self) -> bool:
         """Check if the currently loaded model is a turbo model"""
         if self.config is None:
@@ -425,33 +435,38 @@ class AceStepHandler:
             acestep_v15_checkpoint_path = self._ensure_model_downloaded(config_path, checkpoint_dir)
 
             if os.path.exists(acestep_v15_checkpoint_path):
-                # Determine attention implementation
-                if use_flash_attention
-                attn_implementation =
+                # Determine attention implementation (prefer flash-attn3 > flash_attention_2 > sdpa)
+                if use_flash_attention:
+                    attn_implementation = self.get_best_attn_implementation()
                     self.dtype = torch.bfloat16
                 else:
                     attn_implementation = "sdpa"
 
-
-
-
-
-
-
-
-                )
-
-
-
-                logger.info("[initialize_service]
-
+                # Try loading with the best available attention implementation, with fallbacks
+                attn_fallback_order = [attn_implementation]
+                if attn_implementation == "kernels-community/flash-attn3":
+                    attn_fallback_order.extend(["flash_attention_2", "sdpa", "eager"])
+                elif attn_implementation == "flash_attention_2":
+                    attn_fallback_order.extend(["sdpa", "eager"])
+                elif attn_implementation == "sdpa":
+                    attn_fallback_order.append("eager")
+
+                for attn_impl in attn_fallback_order:
+                    try:
+                        logger.info(f"[initialize_service] Attempting to load model with attention implementation: {attn_impl}")
+
                         self.model = AutoModel.from_pretrained(
-                    acestep_v15_checkpoint_path,
-                    trust_remote_code=True,
-                    attn_implementation=
+                            acestep_v15_checkpoint_path,
+                            trust_remote_code=True,
+                            attn_implementation=attn_impl,
+                            dtype="bfloat16"
                         )
-
-
+                        attn_implementation = attn_impl
+                        break
+                    except Exception as e:
+                        logger.warning(f"[initialize_service] Failed to load model with {attn_impl}: {e}")
+                        if attn_impl == attn_fallback_order[-1]:
+                            raise e
 
                 self.model.config._attn_implementation = attn_implementation
                 self.config = self.model.config
@@ -466,6 +481,8 @@ class AceStepHandler:
             else:
                 self.model = self.model.to("cpu").to(self.dtype)
             self.model.eval()
+            # Disable gradients for all parameters (required for ZeroGPU pickling)
+            self.model.requires_grad_(False)
 
             if compile_model:
                 self.model = torch.compile(self.model)
@@ -498,7 +515,8 @@ class AceStepHandler:
                 self.silence_latent = torch.load(silence_latent_path).transpose(1, 2)
                 # Always keep silence_latent on GPU - it's used in many places outside model context
                 # and is small enough that it won't significantly impact VRAM
-
+                # Use detach() to ensure no gradients (required for ZeroGPU pickling)
+                self.silence_latent = self.silence_latent.to(device).to(self.dtype).detach()
             else:
                 raise FileNotFoundError(f"Silence latent not found at {silence_latent_path}")
         else:
@@ -519,6 +537,8 @@ class AceStepHandler:
             else:
                 self.vae = self.vae.to("cpu").to(vae_dtype)
             self.vae.eval()
+            # Disable gradients for all parameters (required for ZeroGPU pickling)
+            self.vae.requires_grad_(False)
         else:
             raise FileNotFoundError(f"VAE checkpoint not found at {vae_checkpoint_path}")
 
@@ -534,12 +554,31 @@ class AceStepHandler:
         else:
             if os.path.exists(text_encoder_path):
                 self.text_tokenizer = AutoTokenizer.from_pretrained(text_encoder_path)
-
+                # Use best attention implementation for text encoder
+                text_encoder_attn = self.get_best_attn_implementation()
+                text_encoder_loaded = False
+                for attn_impl in [text_encoder_attn, "flash_attention_2", "sdpa", "eager"]:
+                    try:
+                        self.text_encoder = AutoModel.from_pretrained(
+                            text_encoder_path,
+                            attn_implementation=attn_impl,
+                            torch_dtype=self.dtype,
+                        )
+                        logger.info(f"[initialize_service] Text encoder loaded with {attn_impl}")
+                        text_encoder_loaded = True
+                        break
+                    except Exception as e:
+                        logger.warning(f"[initialize_service] Failed to load text encoder with {attn_impl}: {e}")
+                        continue
+                if not text_encoder_loaded:
+                    raise RuntimeError("Failed to load text encoder with any attention implementation")
                 if not self.offload_to_cpu:
                     self.text_encoder = self.text_encoder.to(device).to(self.dtype)
                 else:
                     self.text_encoder = self.text_encoder.to("cpu").to(self.dtype)
                 self.text_encoder.eval()
+                # Disable gradients for all parameters (required for ZeroGPU pickling)
+                self.text_encoder.requires_grad_(False)
             else:
                 raise FileNotFoundError(f"Text encoder not found at {text_encoder_path}")
 
@@ -2722,9 +2761,12 @@ class AceStepHandler:
             pass
 
         if self.model is None or self.vae is None or self.text_tokenizer is None or self.text_encoder is None:
+            missing = [k for k, v in [("model", self.model), ("vae", self.vae),
+                                      ("text_tokenizer", self.text_tokenizer), ("text_encoder", self.text_encoder)] if v is None]
+            logger.error(f"[generate_music] Model not fully initialized. Missing: {missing}")
             return {
                 "audios": [],
-                "status_message": "❌ Model not fully initialized.
+                "status_message": f"❌ Model not fully initialized. Missing components: {missing}",
                 "extra_outputs": {},
                 "success": False,
                 "error": "Model not fully initialized",
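The loader above walks a fallback ladder of attention implementations (kernels flash-attn3, then flash_attention_2, then sdpa, then eager) and keeps the first one that loads. A condensed sketch of the same pattern as a standalone helper follows; `load_with_attn_fallback` and `model_id` are illustrative names, not part of the commit, and the ladder mirrors the order used in the change.

from loguru import logger
from transformers import AutoModel

FALLBACKS = {
    "kernels-community/flash-attn3": ["kernels-community/flash-attn3", "flash_attention_2", "sdpa", "eager"],
    "flash_attention_2": ["flash_attention_2", "sdpa", "eager"],
    "sdpa": ["sdpa", "eager"],
}

def load_with_attn_fallback(model_id: str, preferred: str = "sdpa"):
    """Try each attention implementation in order; return (model, implementation used)."""
    last_error = None
    for attn_impl in FALLBACKS.get(preferred, [preferred, "eager"]):
        try:
            model = AutoModel.from_pretrained(
                model_id,
                trust_remote_code=True,
                attn_implementation=attn_impl,
            )
            logger.info(f"Loaded {model_id} with {attn_impl}")
            return model, attn_impl
        except Exception as exc:  # e.g. missing flash-attn wheel or unsupported kernel
            logger.warning(f"{attn_impl} failed for {model_id}: {exc}")
            last_error = exc
    raise last_error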
acestep/inference.py
CHANGED
@@ -18,20 +18,6 @@ from acestep.audio_utils import AudioSaver, generate_uuid_from_params
 # HuggingFace Space environment detection
 IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
 
-def _get_spaces_gpu_decorator(duration=180):
-    """
-    Get the @spaces.GPU decorator if running in HuggingFace Space environment.
-    Returns identity decorator if not in Space environment.
-    """
-    if IS_HUGGINGFACE_SPACE:
-        try:
-            import spaces
-            return spaces.GPU(duration=duration)
-        except ImportError:
-            logger.warning("spaces package not found, GPU decorator disabled")
-            return lambda func: func
-    return lambda func: func
-
 
 @dataclass
 class GenerationParams:
@@ -289,7 +275,6 @@ def _update_metadata_from_lm(
     return bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics
 
 
-@_get_spaces_gpu_decorator(duration=180)
 def generate_music(
     dit_handler,
     llm_handler,
@@ -924,6 +909,19 @@ def create_sample(
         ... print(f"Lyrics: {result.lyrics}")
         ... print(f"BPM: {result.bpm}")
     """
+    import torch
+    # Debug logging for ZeroGPU diagnosis
+    logger.info(f"[create_sample Debug] Entry: IS_HUGGINGFACE_SPACE={IS_HUGGINGFACE_SPACE}")
+    logger.info(f"[create_sample Debug] torch.cuda.is_available()={torch.cuda.is_available()}")
+    if torch.cuda.is_available():
+        logger.info(f"[create_sample Debug] torch.cuda.current_device()={torch.cuda.current_device()}")
+    logger.info(f"[create_sample Debug] llm_handler.device={llm_handler.device}, llm_handler.offload_to_cpu={llm_handler.offload_to_cpu}")
+    if llm_handler.llm is not None:
+        try:
+            logger.info(f"[create_sample Debug] Model device: {next(llm_handler.llm.parameters()).device}")
+        except Exception as e:
+            logger.info(f"[create_sample Debug] Could not get model device: {e}")
+
     # Check if LLM is initialized
     if not llm_handler.llm_initialized:
         return CreateSampleResult(
acestep/llm_inference.py
CHANGED
@@ -30,6 +30,9 @@ class LLMHandler:
     # HuggingFace Space environment detection
     IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
 
+    # Force IS_ZEROGPU=True when on HuggingFace Space, as the env var detection is unreliable
+    IS_ZEROGPU = IS_HUGGINGFACE_SPACE or os.environ.get("ZEROGPU") is not None
+
     def __init__(self, persistent_storage_path: Optional[str] = None):
         """Initialize LLMHandler with default values"""
         self.llm = None
@@ -190,20 +193,74 @@ class LLMHandler:
         return self.build_formatted_prompt(
             caption, lyrics, is_negative_prompt=True, generation_phase="cot", negative_prompt=negative_prompt
         )
-
+
+    def is_flash_attn3_available(self) -> bool:
+        """Check if flash-attn3 via kernels library is available"""
+        try:
+            import kernels
+            return True
+        except ImportError:
+            return False
+
+    def is_flash_attention_available(self) -> bool:
+        """Check if flash attention is available on the system"""
+        try:
+            import flash_attn
+            return True
+        except ImportError:
+            return False
+
+    def get_best_attn_implementation(self) -> str:
+        """Get the best available attention implementation"""
+        if self.is_flash_attn3_available():
+            return "kernels-community/flash-attn3"
+        elif self.is_flash_attention_available():
+            return "flash_attention_2"
+        else:
+            return "sdpa"
+
     def _load_pytorch_model(self, model_path: str, device: str) -> Tuple[bool, str]:
         """Load PyTorch model from path and return (success, status_message)"""
         try:
-
+            # Try loading with the best available attention implementation
+            attn_implementation = self.get_best_attn_implementation()
+            attn_fallback_order = [attn_implementation]
+            if attn_implementation == "kernels-community/flash-attn3":
+                attn_fallback_order.extend(["flash_attention_2", "sdpa", "eager"])
+            elif attn_implementation == "flash_attention_2":
+                attn_fallback_order.extend(["sdpa", "eager"])
+            elif attn_implementation == "sdpa":
+                attn_fallback_order.append("eager")
+
+            for attn_impl in attn_fallback_order:
+                try:
+                    logger.info(f"[LLM Load] Attempting to load model with attention implementation: {attn_impl}")
+                    self.llm = AutoModelForCausalLM.from_pretrained(
+                        model_path,
+                        trust_remote_code=True,
+                        attn_implementation=attn_impl,
+                        torch_dtype=self.dtype,
+                    )
+                    attn_implementation = attn_impl
+                    break
+                except Exception as e:
+                    logger.warning(f"[LLM Load] Failed to load model with {attn_impl}: {e}")
+                    if attn_impl == attn_fallback_order[-1]:
+                        raise e
+
+            logger.info(f"[LLM Load Debug] Model loaded with {attn_implementation}, initial device: {next(self.llm.parameters()).device}")
             if not self.offload_to_cpu:
                 self.llm = self.llm.to(device).to(self.dtype)
             else:
                 self.llm = self.llm.to("cpu").to(self.dtype)
+            logger.info(f"[LLM Load Debug] After .to(), model device: {next(self.llm.parameters()).device}")
             self.llm.eval()
+            # Disable gradients for all parameters (required for ZeroGPU pickling)
+            self.llm.requires_grad_(False)
             self.llm_backend = "pt"
             self.llm_initialized = True
             logger.info(f"5Hz LM initialized successfully using PyTorch backend on {device}")
-            status_msg = f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nBackend: PyTorch\nDevice: {device}"
+            status_msg = f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nBackend: PyTorch ({attn_implementation})\nDevice: {device}"
             return True, status_msg
         except Exception as e:
             return False, f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
@@ -312,6 +369,11 @@ class LLMHandler:
 
         self.device = device
         self.offload_to_cpu = offload_to_cpu
+
+        # Debug logging for ZeroGPU diagnosis
+        logger.info(f"[LLM Init Debug] IS_ZEROGPU={self.IS_ZEROGPU}, IS_HUGGINGFACE_SPACE={self.IS_HUGGINGFACE_SPACE}")
+        logger.info(f"[LLM Init Debug] torch.cuda.is_available()={torch.cuda.is_available()}")
+        logger.info(f"[LLM Init Debug] device={device}, offload_to_cpu={offload_to_cpu}")
         # Set dtype based on device: bfloat16 for cuda, float32 for cpu
         if dtype is None:
             self.dtype = torch.bfloat16 if device in ["cuda", "xpu"] else torch.float32
@@ -577,8 +639,11 @@ class LLMHandler:
         )
 
         with self._load_model_context():
-            inputs
-
+            # Move inputs to the same device as the model (important for ZeroGPU where model may be on CPU)
+            model_device = next(self.llm.parameters()).device
+            inputs = {k: v.to(model_device) for k, v in inputs.items()}
+            logger.info(f"[_run_pt_single Debug] Inputs moved to model device: {model_device}")
+            logger.info(f"[_run_pt_single Debug] Input actual device: {inputs['input_ids'].device}")
             # Calculate max_new_tokens based on target_duration if specified
             # 5 audio codes = 1 second, plus ~500 tokens for CoT metadata and safety margin
             if target_duration is not None and target_duration > 0:
@@ -618,7 +683,7 @@ class LLMHandler:
                 truncation=True,
             )
             self.llm_tokenizer.padding_side = original_padding_side
-            batch_inputs_tokenized = {k: v.to(
+            batch_inputs_tokenized = {k: v.to(model_device) for k, v in batch_inputs_tokenized.items()}
 
             # Extract batch inputs
             batch_input_ids = batch_inputs_tokenized['input_ids']
@@ -1988,7 +2053,8 @@ class LLMHandler:
         This allows us to call update_state() after each token generation.
         """
         model = self.llm
-        device
+        # Get device from model (important for ZeroGPU where model may be on different device than self.device)
+        device = next(model.parameters()).device
 
         # Initialize generated sequences
         generated_ids = input_ids.clone()
@@ -2088,7 +2154,8 @@ class LLMHandler:
         Batch format: [cond_input, uncond_input]
         """
         model = self.llm
-        device
+        # Get device from model (important for ZeroGPU where model may be on different device than self.device)
+        device = next(model.parameters()).device
         batch_size = batch_input_ids.shape[0] // 2  # Half are conditional, half are unconditional
         cond_start_idx = 0
         uncond_start_idx = batch_size
@@ -2309,7 +2376,30 @@ class LLMHandler:
         Context manager to load a model to GPU and offload it back to CPU after use.
         Only used for PyTorch backend when offload_to_cpu is True.
         """
+        logger.info(f"[_load_model_context Debug] Entry: offload_to_cpu={self.offload_to_cpu}, backend={self.llm_backend}, self.device={self.device}")
+        logger.info(f"[_load_model_context Debug] torch.cuda.is_available()={torch.cuda.is_available()}, IS_ZEROGPU={self.IS_ZEROGPU}")
+
+        model_device = None
+        if self.llm is not None:
+            model_device = next(self.llm.parameters()).device
+            logger.info(f"[_load_model_context Debug] Model current device: {model_device}")
+
+        # In ZeroGPU, model may be on CPU even though self.device="cuda" (due to hijacked .to() during init)
+        # Move to CUDA if available and model is on CPU
+        needs_move_to_cuda = (
+            self.llm is not None
+            and torch.cuda.is_available()
+            and model_device is not None
+            and model_device.type == "cpu"
+        )
+
+        if needs_move_to_cuda:
+            logger.info(f"[_load_model_context Debug] Moving model from CPU to cuda")
+            self.llm = self.llm.to("cuda").to(self.dtype)
+            logger.info(f"[_load_model_context Debug] Model now on: {next(self.llm.parameters()).device}")
+
         if not self.offload_to_cpu:
+            logger.info(f"[_load_model_context Debug] offload_to_cpu=False, yielding")
             yield
             return
 
@@ -2383,7 +2473,9 @@ class LLMHandler:
             device = next(model_runner.model.parameters()).device
             self._hf_model_for_scoring = self._hf_model_for_scoring.to(device)
             self._hf_model_for_scoring.eval()
-
+            # Disable gradients for all parameters (required for ZeroGPU pickling)
+            self._hf_model_for_scoring.requires_grad_(False)
+
             logger.info(f"HuggingFace model for scoring ready on {device}")
 
         return self._hf_model_for_scoring
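A recurring change in this file is to stop trusting the cached self.device string and instead ask the model where it currently lives, since ZeroGPU can leave the weights on CPU until a GPU is attached. A minimal sketch of that pattern follows; `generate_on_model_device` is an illustrative name and the model and tokenizer are placeholders, not part of the commit.

import torch

def generate_on_model_device(model, tokenizer, prompt: str, max_new_tokens: int = 32):
    # Derive the device from the parameters themselves; under ZeroGPU this reflects
    # the actual placement at call time, unlike a device string cached at init.
    model_device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model_device) for k, v in inputs.items()}
    with torch.no_grad():
        return model.generate(**inputs, max_new_tokens=max_new_tokens)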
app.py
CHANGED
@@ -1,8 +1,13 @@
 """
 ACE-Step v1.5 - HuggingFace Space Entry Point
-
 This file serves as the entry point for HuggingFace Space deployment.
 It initializes the service and launches the Gradio interface.
+ZeroGPU Support:
+- ZeroGPU uses the 'spaces' package to intercept CUDA operations
+- Models are loaded to "cuda" during startup but actual GPU allocation is deferred
+- Handlers are registered globally so forked processes inherit them without pickling
+- @spaces.GPU decorators are on top-level Gradio event handlers, not internal functions
+- nano-vllm uses direct CUDA APIs that bypass spaces interception, so we use PyTorch backend
 """
 import os
 import sys
@@ -22,12 +27,26 @@ os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 for proxy_var in ['http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY', 'ALL_PROXY']:
     os.environ.pop(proxy_var, None)
 
+# Import spaces for ZeroGPU support (must be imported before torch for proper interception)
+# This is a no-op if not running on HuggingFace Spaces
+try:
+    import spaces
+    HAS_SPACES = True
+except ImportError:
+    HAS_SPACES = False
+
 import torch
 from acestep.handler import AceStepHandler
 from acestep.llm_inference import LLMHandler
 from acestep.dataset_handler import DatasetHandler
 from acestep.gradio_ui import create_gradio_interface
 
+# Detect ZeroGPU environment
+IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
+# ZeroGPU detection: check env var OR assume ZeroGPU for all HF Spaces (safer default)
+# The SPACE_HARDWARE env var is unreliable, so we assume ZeroGPU if on HF Space
+IS_ZEROGPU = IS_HUGGINGFACE_SPACE or os.environ.get("ZEROGPU") is not None
+
 
 def get_gpu_memory_gb():
     """
@@ -105,14 +124,30 @@ def main():
         print("UI will be fully functional but generation is disabled")
         print("=" * 60)
 
+    # Log ZeroGPU detection
+    if IS_ZEROGPU:
+        print("=" * 60)
+        print("ZeroGPU environment detected")
+        print("- Using spaces package for GPU allocation")
+        print("- PyTorch backend forced for LLM (nano-vllm incompatible)")
+        print("- GPU will be allocated on-demand during generation")
+        print("=" * 60)
+
     # Get persistent storage path (auto-detect)
     persistent_storage_path = get_persistent_storage_path()
 
     # Detect GPU memory for auto-configuration
+    # Note: In ZeroGPU, GPU may not be available during startup, so this may return 0
     gpu_memory_gb = get_gpu_memory_gb()
-    auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < 16
 
-
+    # For ZeroGPU, we don't need CPU offload as GPU is allocated dynamically
+    if IS_ZEROGPU:
+        auto_offload = False
+        print("ZeroGPU: CPU offload disabled (GPU allocated on-demand)")
+    else:
+        auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < 16
+
+    if not debug_ui and not IS_ZEROGPU:
         if auto_offload:
             print(f"Detected GPU memory: {gpu_memory_gb:.2f} GB (< 16GB)")
             print("Auto-enabling CPU offload to reduce GPU memory usage")
@@ -140,7 +175,11 @@ def main():
         "SERVICE_MODE_LM_MODEL",
         "acestep-5Hz-lm-1.7B"
     )
-
+    # For ZeroGPU, force PyTorch backend (nano-vllm uses direct CUDA APIs)
+    if IS_ZEROGPU:
+        backend = "pt"
+    else:
+        backend = os.environ.get("SERVICE_MODE_BACKEND", "vllm")
     device = "auto"
 
     print(f"Service mode configuration:")
@@ -151,6 +190,7 @@ def main():
     print(f" Backend: {backend}")
     print(f" Offload to CPU: {auto_offload}")
     print(f" DEBUG_UI: {debug_ui}")
+    print(f" ZeroGPU: {IS_ZEROGPU}")
 
     # Determine flash attention availability
     use_flash_attention = dit_handler.is_flash_attention_available()
@@ -230,7 +270,7 @@ def main():
     else:
        print(f"Warning: 5Hz LM initialization failed: {lm_status}", file=sys.stderr)
        init_status += f"\n{lm_status}"
-
+
     # Build available models list for UI
     available_dit_models = [config_path]
     if config_path_2 and dit_handler_2 is not None:
@@ -275,7 +315,7 @@ def main():
 
     # Enable queue for multi-user support
     print("Enabling queue for multi-user support...")
-    demo.queue(max_size=20
+    demo.queue(max_size=20)
 
     # Launch
     print("Launching server on 0.0.0.0:7860...")
@@ -288,4 +328,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
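Condensed, the startup ordering that the new module docstring describes looks roughly like the sketch below: import `spaces` before `torch` so CUDA calls can be intercepted, treat any detected Space as ZeroGPU, then pick the backend and offload policy from that. The names follow the commit; `get_gpu_memory_gb` is re-sketched here only to keep the example self-contained, and this is a trimmed-down illustration rather than the real entry point.

import os

try:
    import spaces  # must be imported before torch so ZeroGPU can intercept CUDA calls
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

import torch

IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
IS_ZEROGPU = IS_HUGGINGFACE_SPACE or os.environ.get("ZEROGPU") is not None

def get_gpu_memory_gb() -> float:
    # On ZeroGPU no GPU is attached at startup, so this returns 0 and offload logic is skipped.
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.get_device_properties(0).total_memory / 1024 ** 3

backend = "pt" if IS_ZEROGPU else os.environ.get("SERVICE_MODE_BACKEND", "vllm")
auto_offload = False if IS_ZEROGPU else 0 < get_gpu_memory_gb() < 16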
requirements.txt
CHANGED
@@ -1,11 +1,7 @@
 # PyTorch with CUDA 12.8 (for Windows/Linux)
-
-
-
-torchvision; sys_platform == 'win32'
-torch>=2.9.1; sys_platform != 'win32'
-torchaudio>=2.9.1; sys_platform != 'win32'
-torchvision; sys_platform != 'win32'
+torch==2.9.1
+torchaudio==2.9.1
+torchvision==0.24.1
 
 # Core dependencies
 transformers>=4.51.0,<4.58.0
@@ -14,6 +10,7 @@ gradio==6.2.0
 matplotlib>=3.7.5
 scipy>=1.10.1
 soundfile>=0.13.1
+ffmpeg-python
 loguru>=0.7.3
 einops>=0.8.1
 accelerate>=1.12.0
@@ -33,6 +30,8 @@ triton-windows>=3.0.0,<3.4; sys_platform == 'win32'
 triton>=3.0.0; sys_platform != 'win32'
 flash-attn @ https://github.com/sdbds/flash-attention-for-windows/releases/download/2.8.2/flash_attn-2.8.2+cu128torch2.7.1cxx11abiFALSEfullbackward-cp311-cp311-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.11' and platform_machine == 'AMD64'
 flash-attn @ https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.12/flash_attn-2.8.3+cu128torch2.10-cp311-cp311-linux_x86_64.whl ; sys_platform == 'linux' and python_version == '3.11'
+# Kernels library for flash-attn3 (preferred over flash-attn when available)
+kernels
 xxhash
 
 # HuggingFace Space required