Merge pull request #5 from ace-step/fix_transcribe_audio_codes

Files changed:
- acestep/gradio_ui/events/generation_handlers.py  +40 -76
- acestep/inference.py  +170 -0
acestep/gradio_ui/events/generation_handlers.py
CHANGED

@@ -13,6 +13,7 @@ from acestep.constants import (
     TASK_TYPES_BASE,
 )
 from acestep.gradio_ui.i18n import t
+from acestep.inference import understand_music
 
 
 def load_metadata(file_obj):
@@ -206,6 +207,9 @@ def load_random_example(task_type: str):
 def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug: bool = False):
     """Smart sample function that uses LM if initialized, otherwise falls back to examples
 
+    This is a Gradio wrapper that uses the understand_music API from acestep.inference
+    to generate examples when LM is available.
+
     Args:
         llm_handler: LLM handler instance
         task_type: The task type (e.g., "text2music")
@@ -216,50 +220,28 @@ def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug
     """
     # Check if LM is initialized
     if llm_handler.llm_initialized:
-        # Use LM to generate example
+        # Use LM to generate example via understand_music API
         try:
-            metadata, status = llm_handler.understand_audio_from_codes(
-                audio_codes="NO USER INPUT",
-                use_constrained_decoding=True,
+            result = understand_music(
+                llm_handler=llm_handler,
+                audio_codes="NO USER INPUT",  # Empty input triggers example generation
                 temperature=0.85,
+                use_constrained_decoding=True,
                 constrained_decoding_debug=constrained_decoding_debug,
             )
 
-            if metadata:
-                caption_value = metadata.get('caption', '')
-                lyrics_value = metadata.get('lyrics', '')
-                think_value = True  # Always enable think when using LM-generated examples
-
-                # Extract optional metadata fields
-                bpm_value = None
-                if 'bpm' in metadata and metadata['bpm'] not in [None, "N/A", ""]:
-                    try:
-                        bpm_value = int(metadata['bpm'])
-                    except (ValueError, TypeError):
-                        pass
-
-                duration_value = None
-                if 'duration' in metadata and metadata['duration'] not in [None, "N/A", ""]:
-                    try:
-                        duration_value = float(metadata['duration'])
-                    except (ValueError, TypeError):
-                        pass
-
-                keyscale_value = metadata.get('keyscale', '')
-                if keyscale_value in [None, "N/A"]:
-                    keyscale_value = ''
-
-                language_value = metadata.get('language', '')
-                if language_value in [None, "N/A"]:
-                    language_value = ''
-
-                timesignature_value = metadata.get('timesignature', '')
-                if timesignature_value in [None, "N/A"]:
-                    timesignature_value = ''
-
+            if result.success:
                 gr.Info(t("messages.lm_generated"))
-                return (caption_value, lyrics_value, think_value, bpm_value, duration_value, keyscale_value, language_value, timesignature_value)
+                return (
+                    result.caption,
+                    result.lyrics,
+                    True,  # Always enable think when using LM-generated examples
+                    result.bpm,
+                    result.duration,
+                    result.keyscale,
+                    result.language,
+                    result.timesignature,
+                )
             else:
                 gr.Warning(t("messages.lm_fallback"))
                 return load_random_example(task_type)
@@ -437,58 +419,40 @@ def transcribe_audio_codes(llm_handler, audio_code_string, constrained_decoding_
     Transcribe audio codes to metadata using LLM understanding.
     If audio_code_string is empty, generate a sample example instead.
 
+    This is a Gradio wrapper around the understand_music API in acestep.inference.
+
     Args:
         llm_handler: LLM handler instance
        audio_code_string: String containing audio codes (or empty for example generation)
        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
 
     Returns:
-        Tuple of (status_message, caption, lyrics, bpm, duration, keyscale, language, timesignature)
+        Tuple of (status_message, caption, lyrics, bpm, duration, keyscale, language, timesignature, is_format_caption)
     """
-    # Check if LLM is initialized
-    if not llm_handler.llm_initialized:
-        return t("messages.lm_not_initialized"), "", "", None, None, "", "", "", False
-    # If codes are empty, this becomes a "generate example" task
-    # Use "NO USER INPUT" as the input to generate a sample
-    if not audio_code_string or not audio_code_string.strip():
-        audio_code_string = "NO USER INPUT"
-
-    # Call LLM understanding
-    metadata, status = llm_handler.understand_audio_from_codes(
+    # Call the inference API
+    result = understand_music(
+        llm_handler=llm_handler,
         audio_codes=audio_code_string,
         use_constrained_decoding=True,
         constrained_decoding_debug=constrained_decoding_debug,
     )
 
-    # Extract metadata fields
-    caption = metadata.get('caption', '')
-    lyrics = metadata.get('lyrics', '')
-    bpm = metadata.get('bpm', '')
-    duration = metadata.get('duration', '')
-    keyscale = metadata.get('keyscale', '')
-    language = metadata.get('language', '')
-    timesignature = metadata.get('timesignature', '')
-
-    # Convert to appropriate types
-    try:
-        bpm = int(bpm) if bpm and bpm != 'N/A' else None
-    except:
-        bpm = None
-
-    try:
-        duration = float(duration) if duration and duration != 'N/A' else None
-    except:
-        duration = None
+    # Handle error case with localized message
+    if not result.success:
+        # Use localized error message for LLM not initialized
+        if result.error == "LLM not initialized":
+            return t("messages.lm_not_initialized"), "", "", None, None, "", "", "", False
+        return result.status_message, "", "", None, None, "", "", "", False
 
     return (
-        status,
-        caption,
-        lyrics,
-        bpm,
-        duration,
-        keyscale,
-        language,
-        timesignature,
+        result.status_message,
+        result.caption,
+        result.lyrics,
+        result.bpm,
+        result.duration,
+        result.keyscale,
+        result.language,
+        result.timesignature,
        True  # Set is_format_caption to True (from Transcribe/LM understanding)
     )
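The two handlers now return differently shaped tuples, and the Gradio output components must match them positionally. A minimal sketch of how a caller might unpack each (variable names are illustrative; the field order is taken from the diffs above):

    # sample_example_smart -> 8 values on the LM success path: think flag third, no status message
    (caption, lyrics, think, bpm, duration,
     keyscale, language, timesignature) = sample_example_smart(llm_handler, "text2music")

    # transcribe_audio_codes -> 9 values: status message first, is_format_caption last
    (status_message, caption, lyrics, bpm, duration, keyscale,
     language, timesignature, is_format_caption) = transcribe_audio_codes(llm_handler, codes)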
acestep/inference.py
CHANGED

@@ -183,6 +183,44 @@ class GenerationResult:
         return asdict(self)
 
 
+@dataclass
+class UnderstandResult:
+    """Result of music understanding from audio codes.
+
+    Attributes:
+        # Metadata Fields
+        caption: Generated caption describing the music
+        lyrics: Generated or extracted lyrics
+        bpm: Beats per minute (None if not detected)
+        duration: Duration in seconds (None if not detected)
+        keyscale: Musical key (e.g., "C Major")
+        language: Vocal language code (e.g., "en", "zh")
+        timesignature: Time signature (e.g., "4/4")
+
+        # Status
+        status_message: Status message from understanding
+        success: Whether understanding completed successfully
+        error: Error message if understanding failed
+    """
+    # Metadata Fields
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert result to dictionary for JSON serialization."""
+        return asdict(self)
+
+
 def _update_metadata_from_lm(
     metadata: Dict[str, Any],
     bpm: Optional[int],
@@ -627,3 +665,135 @@ def generate_music(
             success=False,
             error=str(e),
         )
+
+
+def understand_music(
+    llm_handler,
+    audio_codes: str,
+    temperature: float = 0.85,
+    cfg_scale: float = 1.0,
+    negative_prompt: str = "NO USER INPUT",
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> UnderstandResult:
+    """Understand music from audio codes using the 5Hz Language Model.
+
+    This function analyzes audio semantic codes and generates metadata about the music,
+    including caption, lyrics, BPM, duration, key scale, language, and time signature.
+
+    If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
+    instead of analyzing existing codes.
+
+    Args:
+        llm_handler: Initialized LLM handler (LLMHandler instance)
+        audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
+            Use empty string or "NO USER INPUT" to generate a sample example.
+        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
+        cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
+        negative_prompt: Negative prompt for CFG guidance
+        top_k: Top-K sampling (None or 0 = disabled)
+        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
+        repetition_penalty: Repetition penalty (1.0 = no penalty)
+        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
+        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
+
+    Returns:
+        UnderstandResult with parsed metadata fields and status
+
+    Example:
+        >>> result = understand_music(llm_handler, audio_codes="<|audio_code_123|>...")
+        >>> if result.success:
+        ...     print(f"Caption: {result.caption}")
+        ...     print(f"BPM: {result.bpm}")
+        ...     print(f"Lyrics: {result.lyrics}")
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        return UnderstandResult(
+            status_message="5Hz LM not initialized. Please initialize it first.",
+            success=False,
+            error="LLM not initialized",
+        )
+
+    # If codes are empty, use "NO USER INPUT" to generate a sample example
+    if not audio_codes or not audio_codes.strip():
+        audio_codes = "NO USER INPUT"
+
+    try:
+        # Call LLM understanding
+        metadata, status = llm_handler.understand_audio_from_codes(
+            audio_codes=audio_codes,
+            temperature=temperature,
+            cfg_scale=cfg_scale,
+            negative_prompt=negative_prompt,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            use_constrained_decoding=use_constrained_decoding,
+            constrained_decoding_debug=constrained_decoding_debug,
+        )
+
+        # Check if LLM returned empty metadata (error case)
+        if not metadata:
+            return UnderstandResult(
+                status_message=status or "Failed to understand audio codes",
+                success=False,
+                error=status or "Empty metadata returned",
+            )
+
+        # Extract and convert fields
+        caption = metadata.get('caption', '')
+        lyrics = metadata.get('lyrics', '')
+        keyscale = metadata.get('keyscale', '')
+        language = metadata.get('language', metadata.get('vocal_language', ''))
+        timesignature = metadata.get('timesignature', '')
+
+        # Convert BPM to int
+        bpm = None
+        bpm_value = metadata.get('bpm')
+        if bpm_value is not None and bpm_value != 'N/A' and bpm_value != '':
+            try:
+                bpm = int(bpm_value)
+            except (ValueError, TypeError):
+                pass
+
+        # Convert duration to float
+        duration = None
+        duration_value = metadata.get('duration')
+        if duration_value is not None and duration_value != 'N/A' and duration_value != '':
+            try:
+                duration = float(duration_value)
+            except (ValueError, TypeError):
+                pass
+
+        # Clean up N/A values
+        if keyscale == 'N/A':
+            keyscale = ''
+        if language == 'N/A':
+            language = ''
+        if timesignature == 'N/A':
+            timesignature = ''
+
+        return UnderstandResult(
+            caption=caption,
+            lyrics=lyrics,
+            bpm=bpm,
+            duration=duration,
+            keyscale=keyscale,
+            language=language,
+            timesignature=timesignature,
+            status_message=status,
+            success=True,
+            error=None,
+        )
+
+    except Exception as e:
+        logger.exception("Music understanding failed")
+        return UnderstandResult(
+            status_message=f"Error: {str(e)}",
+            success=False,
+            error=str(e),
+        )
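The new understand_music entry point is usable outside the Gradio UI as well. A minimal sketch, assuming llm_handler is an already-initialized LLMHandler as described in the docstring above (handler setup is not part of this diff):

    from acestep.inference import understand_music

    # An empty string (or "NO USER INPUT") asks the LM to generate a sample example
    result = understand_music(llm_handler, audio_codes="")
    if result.success:
        print(result.caption, result.bpm, result.keyscale)
        payload = result.to_dict()  # plain dict via dataclasses.asdict, ready for JSON
    else:
        print(result.status_message, result.error)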