ChuxiJ committed
Commit 1241c80 · 1 Parent(s): 26b4474

cot caption & language LM

acestep/api_server.py CHANGED
@@ -35,6 +35,10 @@ from starlette.datastructures import UploadFile as StarletteUploadFile
 
 from acestep.handler import AceStepHandler
 from acestep.llm_inference import LLMHandler
+from acestep.constants import (
+    DEFAULT_DIT_INSTRUCTION,
+    DEFAULT_LM_INSTRUCTION,
+)
 
 
 JobStatus = Literal["queued", "running", "succeeded", "failed"]
@@ -70,7 +74,7 @@ class GenerateMusicRequest(BaseModel):
     repainting_start: float = 0.0
     repainting_end: Optional[float] = None
 
-    instruction: str = "Fill the audio semantic mask based on the given conditions:"
+    instruction: str = DEFAULT_DIT_INSTRUCTION
     audio_cover_strength: float = 1.0
     task_type: str = "text2music"
 
@@ -102,8 +106,8 @@ class GenerateMusicRequest(BaseModel):
 _LM_DEFAULT_TEMPERATURE = 0.85
 _LM_DEFAULT_CFG_SCALE = 2.0
 _LM_DEFAULT_TOP_P = 0.9
-_DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
-_DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
+_DEFAULT_DIT_INSTRUCTION = DEFAULT_DIT_INSTRUCTION
+_DEFAULT_LM_INSTRUCTION = DEFAULT_LM_INSTRUCTION
 
 
 class CreateJobResponse(BaseModel):
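
Note (not part of the commit): a minimal check that this refactor is behavior-preserving — the centralized constants carry the exact strings the API previously inlined, so GenerateMusicRequest defaults are unchanged. Assumes acestep is importable.

from acestep.constants import DEFAULT_DIT_INSTRUCTION, DEFAULT_LM_INSTRUCTION

assert DEFAULT_DIT_INSTRUCTION == "Fill the audio semantic mask based on the given conditions:"
assert DEFAULT_LM_INSTRUCTION == "Generate audio semantic tokens based on the given conditions:"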
acestep/constants.py ADDED
@@ -0,0 +1,97 @@
+"""
+Constants for ACE-Step
+Centralized constants used across the codebase
+"""
+
+# ==============================================================================
+# Language Constants
+# ==============================================================================
+
+VALID_LANGUAGES = [
+    'ar', 'az', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en',
+    'es', 'fa', 'fi', 'fr', 'he', 'hi', 'hr', 'ht', 'hu', 'id',
+    'is', 'it', 'ja', 'ko', 'la', 'lt', 'ms', 'ne', 'nl', 'no',
+    'pa', 'pl', 'pt', 'ro', 'ru', 'sa', 'sk', 'sr', 'sv', 'sw',
+    'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'yue', 'zh',
+    'unknown'
+]
+
+
+# ==============================================================================
+# Keyscale Constants
+# ==============================================================================
+
+KEYSCALE_NOTES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
+KEYSCALE_ACCIDENTALS = ['', '#', 'b', '♯', '♭']  # empty + ASCII sharp/flat + Unicode sharp/flat
+KEYSCALE_MODES = ['major', 'minor']
+
+# Generate all valid keyscales: 7 notes × 5 accidentals × 2 modes = 70 combinations
+VALID_KEYSCALES = set()
+for note in KEYSCALE_NOTES:
+    for acc in KEYSCALE_ACCIDENTALS:
+        for mode in KEYSCALE_MODES:
+            VALID_KEYSCALES.add(f"{note}{acc} {mode}")
+
+
+# ==============================================================================
+# Metadata Range Constants
+# ==============================================================================
+
+# BPM (Beats Per Minute) range
+BPM_MIN = 30
+BPM_MAX = 300
+
+# Duration range (in seconds)
+DURATION_MIN = 10
+DURATION_MAX = 600
+
+# Valid time signatures
+VALID_TIME_SIGNATURES = [2, 3, 4, 6]
+
+
+# ==============================================================================
+# Task Type Constants
+# ==============================================================================
+
+TASK_TYPES = ["text2music", "repaint", "cover", "extract", "lego", "complete"]
+
+# Task types available for turbo models (subset)
+TASK_TYPES_TURBO = ["text2music", "repaint", "cover"]
+
+# Task types available for base models (full set)
+TASK_TYPES_BASE = ["text2music", "repaint", "cover", "extract", "lego", "complete"]
+
+
+# ==============================================================================
+# Instruction Constants
+# ==============================================================================
+
+# Default instructions
+DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
+DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
+
+# Instruction templates for each task type
+# Note: Some instructions use placeholders like {TRACK_NAME} or {TRACK_CLASSES}
+# These should be formatted using .format() or f-strings when used
+TASK_INSTRUCTIONS = {
+    "text2music": "Fill the audio semantic mask based on the given conditions:",
+    "repaint": "Repaint the mask area based on the given conditions:",
+    "cover": "Generate audio semantic tokens based on the given conditions:",
+    "extract": "Extract the {TRACK_NAME} track from the audio:",
+    "extract_default": "Extract the track from the audio:",
+    "lego": "Generate the {TRACK_NAME} track based on the audio context:",
+    "lego_default": "Generate the track based on the audio context:",
+    "complete": "Complete the input track with {TRACK_CLASSES}:",
+    "complete_default": "Complete the input track:",
+}
+
+
+# ==============================================================================
+# Track/Instrument Constants
+# ==============================================================================
+
+TRACK_NAMES = [
+    "woodwinds", "brass", "fx", "synth", "strings", "percussion",
+    "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"
+]
+
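
Note (not part of the commit): a quick sanity sketch of the constants defined above — the keyscale grammar expands to 7 notes × 5 accidentals × 2 modes = 70 distinct strings, 'unknown' is the language code reserved for instrumentals, and the turbo task list is a subset of the full task list. Assumes the new module is importable.

from acestep.constants import VALID_KEYSCALES, VALID_LANGUAGES, TASK_TYPES, TASK_TYPES_TURBO

assert len(VALID_KEYSCALES) == 70
assert "F# minor" in VALID_KEYSCALES and "Bb major" in VALID_KEYSCALES
assert "unknown" in VALID_LANGUAGES                 # used for instrumental tracks
assert set(TASK_TYPES_TURBO).issubset(TASK_TYPES)   # turbo models expose a subset of tasks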
acestep/constrained_logits_processor.py CHANGED
@@ -6,6 +6,18 @@ from transformers import AutoTokenizer
 from transformers.generation.logits_process import LogitsProcessor
 import os
 import torch
+from acestep.constants import (
+    VALID_LANGUAGES,
+    KEYSCALE_NOTES,
+    KEYSCALE_ACCIDENTALS,
+    KEYSCALE_MODES,
+    VALID_KEYSCALES,
+    BPM_MIN,
+    BPM_MAX,
+    DURATION_MIN,
+    DURATION_MAX,
+    VALID_TIME_SIGNATURES,
+)
 
 
 # ==============================================================================
@@ -18,6 +30,8 @@ class FSMState(Enum):
     BPM_NAME = auto()  # Generating "bpm: "
     BPM_VALUE = auto()  # Generating numeric value 30-300
     NEWLINE_AFTER_BPM = auto()  # Generating "\n" after bpm value
+    CAPTION_NAME = auto()  # Generating "caption: "
+    CAPTION_VALUE = auto()  # Generating caption text (no code blocks/newlines)
     DURATION_NAME = auto()  # Generating "duration: "
     DURATION_VALUE = auto()  # Generating numeric value 10-600
     NEWLINE_AFTER_DURATION = auto()
@@ -27,6 +41,8 @@ class FSMState(Enum):
     KEYSCALE_NAME = auto()  # Generating "keyscale: "
     KEYSCALE_VALUE = auto()  # Generating keyscale pattern
     NEWLINE_AFTER_KEYSCALE = auto()
+    LANGUAGE_NAME = auto()  # Generating "language: "
+    LANGUAGE_VALUE = auto()  # Generating language code (en, zh, ja, etc.)
     TIMESIG_NAME = auto()  # Generating "timesignature: "
     TIMESIG_VALUE = auto()  # Generating 2, 3, 4, or 6
     NEWLINE_AFTER_TIMESIG = auto()
@@ -42,15 +58,18 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
     This processor enforces the following format:
     <think>
     bpm: [30-300]
+    caption: [text without code blocks, ends with period + newline]
     duration: [10-600]
-    genres: [any non-empty string]
     keyscale: [A-G][#/♭]? [major/minor]
+    language: [en/zh/ja/ko/es/fr/de/uk/ru/...]
     timesignature: [2/3/4/6]
     </think>
 
     It uses token masking (setting invalid token logits to -inf) to enforce constraints.
     For numeric fields, it uses early-blocking to prevent out-of-range values.
     For field transitions (e.g., end of numeric value), it compares P(newline) vs P(digit).
+    For caption field, it blocks code blocks and newlines, and only transitions when
+    the previous token was a period and newline has the highest probability.
     """
 
     def __init__(
@@ -80,15 +99,19 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         self.enabled = enabled
         self.debug = debug
         self.skip_genres = skip_genres
+        self.skip_caption = False  # Set to True to skip caption field generation
+        self.skip_language = False  # Set to True to skip language field generation
         self.caption: Optional[str] = None  # Set via update_caption() before each generation
 
         # User-provided metadata fields (optional)
         # If provided, these fields will be used directly instead of generating
-        # Format: {"bpm": "120", "duration": "234", "keyscale": "G major", "timesignature": "4", "genres": "Pop Rock"}
+        # Format: {"bpm": "120", "caption": "...", "duration": "234", "keyscale": "G major", "language": "en", "timesignature": "4", "genres": "Pop Rock"}
         self.user_provided_metadata: Dict[str, Optional[str]] = {
             "bpm": None,
+            "caption": None,
             "duration": None,
             "keyscale": None,
+            "language": None,
             "timesignature": None,
             "genres": None,
         }
@@ -114,6 +137,10 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         self.accumulated_value = ""  # For numeric/text value accumulation (legacy, for compatibility)
         self.accumulated_token_ids: List[int] = []  # Token ID sequence for keyscale (and other fields)
 
+        # Caption generation state tracking
+        self.caption_after_newline = False  # Track if we're right after a newline in caption
+        self.caption_token_count = 0  # Track token count for caption (max 512)
+
         # Token queue for user-provided fields (injected directly without generation)
         self.user_field_token_queue: List[int] = []
         self.current_user_field: Optional[str] = None  # Current field being injected
@@ -137,9 +164,9 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
 
         # Field definitions (needed before building prefix trees)
         self.field_specs = {
-            "bpm": {"min": 30, "max": 300},
-            "duration": {"min": 10, "max": 600},
-            "timesignature": {"valid_values": [2, 3, 4, 6]},
+            "bpm": {"min": BPM_MIN, "max": BPM_MAX},
+            "duration": {"min": DURATION_MIN, "max": DURATION_MAX},
+            "timesignature": {"valid_values": VALID_TIME_SIGNATURES},
         }
 
         # Build valid numeric values for BPM, Duration, Timesignature
@@ -170,6 +197,9 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
             context_prefix_for_tokenization="timesignature: "
         )
 
+        # Build language prefix tree (similar to keyscale but for language codes)
+        self.language_prefix_tree = self._build_language_prefix_tree()
+
         self._load_genres_vocab()
 
         # Note: Caption-based genre filtering is initialized via update_caption() before each generation
@@ -182,9 +212,11 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
             FSMState.THINK_TAG: "<think>",
             FSMState.NEWLINE_AFTER_THINK: "\n",
             FSMState.BPM_NAME: "bpm:",
+            FSMState.CAPTION_NAME: "caption:",
             FSMState.DURATION_NAME: "duration:",
            FSMState.GENRES_NAME: "genres:",
             FSMState.KEYSCALE_NAME: "keyscale:",
+            FSMState.LANGUAGE_NAME: "language:",
             FSMState.TIMESIG_NAME: "timesignature:",
             FSMState.THINK_END_TAG: "</think>",
         }
@@ -198,17 +230,21 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         even if the field is user-provided (we still need to generate the field name).
 
         Args:
-            current_field: Current field name ("bpm", "duration", "genres", "keyscale", "timesignature")
+            current_field: Current field name ("bpm", "caption", "duration", "genres", "keyscale", "language", "timesignature")
 
         Returns:
            Next FSMState (NAME state of next field), or THINK_END_TAG if no more fields
         """
-        field_order = ["bpm", "duration", "genres", "keyscale", "timesignature"]
+        # New field order: bpm -> caption -> duration -> keyscale -> language -> timesignature
+        # genres is optional and can be skipped
+        field_order = ["bpm", "caption", "duration", "genres", "keyscale", "language", "timesignature"]
         field_to_state = {
             "bpm": FSMState.BPM_NAME,
+            "caption": FSMState.CAPTION_NAME,
             "duration": FSMState.DURATION_NAME,
             "genres": FSMState.GENRES_NAME,
             "keyscale": FSMState.KEYSCALE_NAME,
+            "language": FSMState.LANGUAGE_NAME,
             "timesignature": FSMState.TIMESIG_NAME,
         }
 
@@ -221,9 +257,13 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         for i in range(current_idx + 1, len(field_order)):
             field = field_order[i]
 
-            # Skip genres if skip_genres is True
+            # Skip fields based on flags
             if field == "genres" and self.skip_genres:
                 continue
+            if field == "caption" and self.skip_caption:
+                continue
+            if field == "language" and self.skip_language:
+                continue
 
             # Return the next field's NAME state (even if user-provided, we still generate field name)
             return field_to_state[field]
@@ -241,12 +281,17 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         }
 
         # Build transitions for all fields (even if user-provided, we still need to generate field name)
-        # Field order: bpm -> duration -> genres -> keyscale -> timesignature
+        # Field order: bpm -> caption -> duration -> genres -> keyscale -> language -> timesignature
 
-        # BPM field: NAME -> VALUE -> next field
+        # BPM field: NAME -> VALUE -> next field (caption or duration)
         self.next_state[FSMState.BPM_NAME] = FSMState.BPM_VALUE
         self.next_state[FSMState.BPM_VALUE] = self._get_next_field_state("bpm")
 
+        # Caption field (only if not skipped): NAME -> VALUE -> next field (duration)
+        if not self.skip_caption:
+            self.next_state[FSMState.CAPTION_NAME] = FSMState.CAPTION_VALUE
+            self.next_state[FSMState.CAPTION_VALUE] = self._get_next_field_state("caption")
+
         # Duration field: NAME -> VALUE -> next field
         self.next_state[FSMState.DURATION_NAME] = FSMState.DURATION_VALUE
         self.next_state[FSMState.DURATION_VALUE] = self._get_next_field_state("duration")
@@ -256,10 +301,15 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         self.next_state[FSMState.GENRES_NAME] = FSMState.GENRES_VALUE
         self.next_state[FSMState.GENRES_VALUE] = self._get_next_field_state("genres")
 
-        # Keyscale field: NAME -> VALUE -> next field
+        # Keyscale field: NAME -> VALUE -> next field (language or timesignature)
         self.next_state[FSMState.KEYSCALE_NAME] = FSMState.KEYSCALE_VALUE
         self.next_state[FSMState.KEYSCALE_VALUE] = self._get_next_field_state("keyscale")
 
+        # Language field (only if not skipped): NAME -> VALUE -> next field (timesignature)
+        if not self.skip_language:
+            self.next_state[FSMState.LANGUAGE_NAME] = FSMState.LANGUAGE_VALUE
+            self.next_state[FSMState.LANGUAGE_VALUE] = self._get_next_field_state("language")
+
         # Timesignature field: NAME -> VALUE -> THINK_END_TAG
         self.next_state[FSMState.TIMESIG_NAME] = FSMState.TIMESIG_VALUE
         self.next_state[FSMState.TIMESIG_VALUE] = FSMState.THINK_END_TAG
@@ -269,6 +319,49 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         self.skip_genres = skip
         self._build_state_transitions()
 
+    def set_skip_caption(self, skip: bool):
+        """Set whether to skip caption generation and rebuild state transitions."""
+        self.skip_caption = skip
+        self._build_state_transitions()
+
+    def set_skip_language(self, skip: bool):
+        """Set whether to skip language generation and rebuild state transitions."""
+        self.skip_language = skip
+        self._build_state_transitions()
+
+    @staticmethod
+    def postprocess_caption(caption: str) -> str:
+        """
+        Post-process caption to remove YAML multi-line formatting.
+        Converts YAML-style multi-line text (with newlines and leading spaces)
+        to a single-line string.
+
+        Example:
+            Input: "An emotional ballad.\\n  The track opens with piano.\\n  More text."
+            Output: "An emotional ballad. The track opens with piano. More text."
+
+        Args:
+            caption: Raw caption text with possible YAML formatting
+
+        Returns:
+            Clean single-line caption
+        """
+        if not caption:
+            return caption
+
+        # Split by newlines
+        lines = caption.split('\n')
+
+        # Process each line: strip leading/trailing whitespace
+        cleaned_lines = []
+        for line in lines:
+            stripped = line.strip()
+            if stripped:
+                cleaned_lines.append(stripped)
+
+        # Join with single space
+        return ' '.join(cleaned_lines)
+
     def set_stop_at_reasoning(self, stop: bool):
         """
         Set whether to stop generation after </think> tag.
@@ -287,8 +380,10 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         Args:
             metadata: Dictionary with optional fields:
                 - "bpm": Optional[str] - e.g., "120"
+                - "caption": Optional[str] - e.g., "A melodic piano piece..."
                 - "duration": Optional[str] - e.g., "234"
                 - "keyscale": Optional[str] - e.g., "G major"
+                - "language": Optional[str] - e.g., "en"
                 - "timesignature": Optional[str] - e.g., "4"
                 - "genres": Optional[str] - e.g., "Pop Rock"
                 If None, clears all user-provided metadata.
@@ -297,7 +392,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
             metadata = {}
 
         # Update user-provided metadata
-        for field in ["bpm", "duration", "keyscale", "timesignature", "genres"]:
+        for field in ["bpm", "caption", "duration", "keyscale", "language", "timesignature", "genres"]:
            if field in metadata:
                 self.user_provided_metadata[field] = metadata[field]
            else:
@@ -328,7 +423,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
 
         # Note tokens for keyscale (A-G)
         self.note_tokens = {}
-        for note in "ABCDEFG":
+        for note in KEYSCALE_NOTES:
            tokens = self.tokenizer.encode(note, add_special_tokens=False)
            if tokens:
                self.note_tokens[note] = tokens[-1]
@@ -370,21 +465,80 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         # EOS token for duration-constrained codes generation
         self.eos_token_id = self.tokenizer.eos_token_id
 
+        # Period token for caption field transition logic
+        period_tokens = self.tokenizer.encode(".", add_special_tokens=False)
+        self.period_token = period_tokens[-1] if period_tokens else None
+
+        # Backtick tokens for blocking code blocks in caption
+        backtick_tokens = self.tokenizer.encode("`", add_special_tokens=False)
+        self.backtick_token = backtick_tokens[-1] if backtick_tokens else None
+
+        # Valid language codes (ISO 639-1 and common variants)
+        self.valid_languages = VALID_LANGUAGES
+
+        # Precompute audio code token IDs (tokens matching <|audio_code_\d+|>)
+        # These should be blocked during caption generation
+        self.audio_code_token_ids: Set[int] = set()
+        self._precompute_audio_code_tokens()
+
+        # Precompute audio code mask for efficient blocking (O(1) instead of O(n))
+        # This mask will be added to scores during caption generation
+        self.audio_code_mask: Optional[torch.Tensor] = None
+        self._build_audio_code_mask()
+
         # Build valid keyscales set (prefix tree will be built after _char_to_tokens is initialized)
         # 7 notes × 5 accidentals (none, #, b, ♯, ♭) × 2 modes = 70 valid combinations
-        notes = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
-        accidentals = ['', '#', 'b', '♯', '♭']  # empty + ASCII sharp/flat + Unicode sharp/flat
-        modes = ['major', 'minor']
-
-        self.valid_keyscales = set()
-        for note in notes:
-            for acc in accidentals:
-                for mode in modes:
-                    self.valid_keyscales.add(f"{note}{acc} {mode}")
+        self.valid_keyscales = VALID_KEYSCALES.copy()
 
         # keyscale_prefix_tree will be built in _precompute_char_token_mapping() after _char_to_tokens is ready
         # Numeric prefix trees will be built after field_specs is defined
 
+    def _precompute_audio_code_tokens(self):
+        """
+        Precompute audio code token IDs (tokens matching <|audio_code_\\d+|>).
+        These tokens should be blocked during caption generation.
+        """
+        import re
+        audio_code_pattern = re.compile(r'^<\|audio_code_\d+\|>$')
+
+        # Iterate through vocabulary to find audio code tokens
+        for token_id in range(self.vocab_size):
+            try:
+                token_text = self.tokenizer.decode([token_id])
+                if audio_code_pattern.match(token_text):
+                    self.audio_code_token_ids.add(token_id)
+            except Exception:
+                continue
+
+        if self.debug:
+            logger.debug(f"Found {len(self.audio_code_token_ids)} audio code tokens")
+
+    def _build_audio_code_mask(self):
+        """
+        Build a precomputed mask tensor for blocking audio code tokens.
+        This mask can be added to scores in O(1) time instead of O(n) loop.
+
+        The mask is [1, vocab_size] tensor with -inf at audio code token positions.
+        """
+        if not self.audio_code_token_ids:
+            self.audio_code_mask = None
+            return
+
+        # Create mask tensor: 0 everywhere, -inf at audio code positions
+        # Use float32 for compatibility with most model dtypes
+        mask = torch.zeros(1, self.vocab_size, dtype=torch.float32)
+
+        # Convert set to list for indexing
+        audio_code_indices = list(self.audio_code_token_ids)
+
+        # Set -inf at audio code token positions
+        mask[0, audio_code_indices] = float('-inf')
+
+        self.audio_code_mask = mask
+
+        if self.debug:
+            logger.debug(f"Built audio code mask for {len(self.audio_code_token_ids)} tokens")
+
     def _build_keyscale_prefix_tree(self) -> Dict[Tuple[int, ...], Set[int]]:
         """
         Build keyscale prefix to allowed tokens mapping based on ACTUAL tokenization.
@@ -560,6 +714,68 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
 
         return prefix_to_tokens
 
+    def _build_language_prefix_tree(self) -> Dict[Tuple[int, ...], Set[int]]:
+        """
+        Build language prefix to allowed tokens mapping based on ACTUAL tokenization.
+        Similar to keyscale prefix tree but for language codes.
+
+        Uses token ID sequences as keys, NOT strings, to avoid tokenization mismatches.
+        """
+        prefix_to_tokens: Dict[Tuple[int, ...], Set[int]] = {}
+
+        context_prefix_for_matching = "language:"
+        context_prefix_for_tokenization = "language: "
+
+        context_token_ids = self.tokenizer.encode(context_prefix_for_matching, add_special_tokens=False)
+
+        if self.debug:
+            context_tokens_str = [self.tokenizer.decode([t]) for t in context_token_ids]
+            logger.debug(f"Context for matching 'language:' tokenizes to {context_token_ids} -> {context_tokens_str}")
+
+        for lang in self.valid_languages:
+            full_text = context_prefix_for_tokenization + lang
+            full_token_ids = self.tokenizer.encode(full_text, add_special_tokens=False)
+
+            context_end_idx = None
+            if len(full_token_ids) >= len(context_token_ids):
+                if full_token_ids[:len(context_token_ids)] == context_token_ids:
+                    context_end_idx = len(context_token_ids)
+
+            if context_end_idx is None:
+                if self.debug:
+                    logger.warning(f"Could not find context prefix in full tokenization of '{full_text}', skipping")
+                continue
+
+            lang_token_ids = full_token_ids[context_end_idx:]
+
+            if not lang_token_ids:
+                if self.debug:
+                    logger.warning(f"No tokens extracted for language '{lang}', skipping")
+                continue
+
+            for i in range(len(lang_token_ids) + 1):
+                token_prefix = tuple(lang_token_ids[:i])
+
+                if token_prefix not in prefix_to_tokens:
+                    prefix_to_tokens[token_prefix] = set()
+
+                if i < len(lang_token_ids):
+                    next_token_id = lang_token_ids[i]
+                    prefix_to_tokens[token_prefix].add(next_token_id)
+                else:
+                    if self.newline_token:
+                        prefix_to_tokens[token_prefix].add(self.newline_token)
+
+        if self.debug:
+            logger.debug(f"Built language prefix tree with {len(prefix_to_tokens)} token sequence prefixes")
+            empty_prefix = tuple()
+            if empty_prefix in prefix_to_tokens:
+                first_tokens = prefix_to_tokens[empty_prefix]
+                decoded_first = [(t, repr(self.tokenizer.decode([t]))) for t in sorted(first_tokens)]
+                logger.debug(f"First tokens allowed for language (empty prefix): {decoded_first}")
+
+        return prefix_to_tokens
+
     def diagnose_keyscale_prefix_tree(self):
         """
         Diagnose the keyscale prefix tree to help debug generation bias.
@@ -926,6 +1142,8 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         self.codes_count = 0  # Reset codes counter
         self.user_field_token_queue = []  # Reset user field token queue
         self.current_user_field = None  # Reset current user field
+        self.caption_after_newline = False  # Reset caption newline tracking
+        self.caption_token_count = 0  # Reset caption token count
 
     def set_target_duration(self, duration: Optional[float]):
         """
@@ -1170,6 +1388,20 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
             return self.newline_token in self.keyscale_prefix_tree[token_prefix]
         return False
 
+    def _get_allowed_language_tokens(self) -> List[int]:
+        """
+        Get allowed tokens for language field using the precomputed prefix tree.
+        Uses token ID sequence as key (not string) to avoid tokenization mismatches.
+        Similar to keyscale.
+        """
+        token_prefix = tuple(self.accumulated_token_ids)
+
+        if token_prefix in self.language_prefix_tree:
+            return list(self.language_prefix_tree[token_prefix])
+
+        # Fallback: no valid continuation found
+        return []
+
     def _get_allowed_timesig_tokens(self) -> List[int]:
         """
         Get allowed tokens for timesignature field using the precomputed prefix tree.
@@ -1269,7 +1501,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         Uses the same tokenization logic as prefix tree building.
 
         Args:
-            field_name: Field name ("bpm", "duration", "keyscale", "timesignature", "genres")
+            field_name: Field name ("bpm", "caption", "duration", "keyscale", "language", "timesignature", "genres")
 
         Returns:
            List of token IDs for the complete field, or None if field is not provided
@@ -1281,8 +1513,10 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         # Build full field string with space (matching prefix tree tokenization)
         field_to_prefix = {
             "bpm": "bpm: ",
+            "caption": "caption: ",
             "duration": "duration: ",
             "keyscale": "keyscale: ",
+            "language": "language: ",
             "timesignature": "timesignature: ",
             "genres": "genres: ",
         }
@@ -1410,6 +1644,67 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
 
            scores = scores + mask
 
+        elif self.state == FSMState.CAPTION_VALUE:
+            # Caption field generation with YAML format support:
+            # - Allow newlines and spaces (YAML multi-line formatting)
+            # - Block audio codes and backticks
+            # - Max 512 tokens
+            # - Transition when model wants to generate next field (non-indented line)
+
+            # Check if field is user-provided and we haven't started injecting yet
+            if self.user_provided_metadata["caption"] is not None and not self.user_field_token_queue and not self.accumulated_value:
+                # Initialize token queue with field value tokens (value + newline)
+                value = self.user_provided_metadata["caption"]
+                value_text = f" {value}\n"
+                value_tokens = self.tokenizer.encode(value_text, add_special_tokens=False)
+                if value_tokens:
+                    self.user_field_token_queue = value_tokens
+                    self.current_user_field = "caption"
+                    # Inject first token
+                    mask[0, value_tokens[0]] = 0
+                    scores = scores + mask
+                    return scores
+
+            # Check if we should transition after a newline (non-indented line = new field)
+            if self.caption_after_newline:
+                # Get top token from current scores
+                top_token_id = torch.argmax(scores[0]).item()
+                top_token_text = self.tokenizer.decode([top_token_id])
+
+                # If top token does NOT start with space/tab, it's a new field (like "duration:")
+                if len(top_token_text) > 0 and top_token_text[0] not in ' \t':
+                    # Caption is ending, transition to next field
+                    self.caption_after_newline = False
+                    self._transition_to_next_state()
+                    # Process with new state (DURATION_NAME)
+                    return self._process_single_sequence(input_ids, scores)
+                else:
+                    # It's indentation, continue caption
+                    self.caption_after_newline = False
+
+            # Block backticks (code blocks)
+            if self.backtick_token is not None:
+                scores[0, self.backtick_token] = float('-inf')
+
+            # Block ALL audio code tokens (critical - these should never appear in caption)
+            # Use precomputed mask for O(1) performance instead of O(n) loop
+            if self.audio_code_mask is not None:
+                # Move mask to same device/dtype as scores if needed
+                if self.audio_code_mask.device != scores.device or self.audio_code_mask.dtype != scores.dtype:
+                    self.audio_code_mask = self.audio_code_mask.to(device=scores.device, dtype=scores.dtype)
+                scores = scores + self.audio_code_mask
+
+            # Enforce 512 token limit for caption
+            if self.caption_token_count >= 512:
+                # Force end by only allowing newline
+                if self.newline_token is not None:
+                    mask[0, self.newline_token] = 0
+                scores = scores + mask
+                return scores
+
+            # Allow natural generation (with blocked audio codes and backticks)
+            return scores
+
         elif self.state == FSMState.DURATION_VALUE:
            # Check if field is user-provided and we haven't started injecting yet
            if self.user_provided_metadata["duration"] is not None and not self.user_field_token_queue and not self.accumulated_token_ids:
@@ -1539,6 +1834,43 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                mask[0, self.newline_token] = 0
            scores = scores + mask
 
+        elif self.state == FSMState.LANGUAGE_VALUE:
+            # Language field: similar to keyscale, uses prefix tree
+
+            # Check if field is user-provided and we haven't started injecting yet
+            if self.user_provided_metadata["language"] is not None and not self.user_field_token_queue and not self.accumulated_token_ids:
+                # Initialize token queue with field value tokens (value + newline)
+                value = self.user_provided_metadata["language"]
+                value_text = f" {value}\n"
+                value_tokens = self.tokenizer.encode(value_text, add_special_tokens=False)
+                if value_tokens:
+                    self.user_field_token_queue = value_tokens
+                    self.current_user_field = "language"
+                    # Inject first token
+                    mask[0, value_tokens[0]] = 0
+                    scores = scores + mask
+                    return scores
+
+            # Check if current token sequence is complete (allows newline)
+            token_prefix = tuple(self.accumulated_token_ids)
+            if token_prefix in self.language_prefix_tree and self.newline_token in self.language_prefix_tree[token_prefix]:
+                # Complete language, allow newline
+                if self.newline_token:
+                    mask[0, self.newline_token] = 0
+                scores = scores + mask
+            else:
+                # Not complete, allow valid continuation tokens
+                allowed = self._get_allowed_language_tokens()
+                if allowed:
+                    for t in allowed:
+                        mask[0, t] = 0
+                    scores = scores + mask
+                else:
+                    # No valid tokens found - force newline to end field
+                    if self.newline_token:
+                        mask[0, self.newline_token] = 0
+                    scores = scores + mask
+
         elif self.state == FSMState.TIMESIG_VALUE:
            # Check if field is user-provided and we haven't started injecting yet
            if self.user_provided_metadata["timesignature"] is not None and not self.user_field_token_queue and not self.accumulated_token_ids:
@@ -1587,6 +1919,8 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         self.position_in_state = 0
         self.accumulated_value = ""  # Legacy, kept for compatibility
         self.accumulated_token_ids = []  # Reset token ID sequence for new field
+        self.caption_after_newline = False  # Reset caption newline tracking
+        self.caption_token_count = 0  # Reset caption token count
        if self.debug:
            logger.debug(f"FSM transition: {old_state.name} -> {self.state.name}")
 
@@ -1703,6 +2037,22 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
            # Genres still uses string-based trie, so keep accumulated_value
            self.accumulated_value += token_str
 
+        elif self.state == FSMState.CAPTION_VALUE:
+            # Track token count for 512 limit
+            self.caption_token_count += 1
+
+            # Accumulate caption text
+            self.accumulated_value += token_str
+
+            # Track if this token is a newline (for transition detection)
+            if generated_token_id == self.newline_token:
+                # Mark that we need to check next token for field transition
+                self.caption_after_newline = True
+            else:
+                # Not a newline - if we were after newline and this is not space,
+                # transition already happened in _process_single_sequence
+                self.caption_after_newline = False
+
         elif self.state == FSMState.KEYSCALE_VALUE:
            if generated_token_id == self.newline_token:
                # Newline ends the field
@@ -1718,4 +2068,16 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                self.accumulated_token_ids.append(generated_token_id)
                # Also update legacy accumulated_value for compatibility
                self.accumulated_value += token_str
+
+        elif self.state == FSMState.LANGUAGE_VALUE:
+            if generated_token_id == self.newline_token:
+                # Newline ends the field
+                self._transition_to_next_state()
+                if self.state in self.fixed_strings:
+                    return
+            else:
+                # Add token ID to sequence (for prefix tree lookup)
+                self.accumulated_token_ids.append(generated_token_id)
+                # Also update legacy accumulated_value for compatibility
+                self.accumulated_value += token_str
 
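
Note (not part of the commit): with the two new fields the processor constrains the CoT block to the order bpm → caption → duration → genres → keyscale → language → timesignature, and the new postprocess_caption helper flattens the YAML-style multi-line captions the model may emit. A minimal usage sketch, assuming the package's torch/transformers dependencies are installed so the module imports cleanly:

from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor

raw = "An emotional ballad.\n  The track opens with piano.\n  More text."
clean = MetadataConstrainedLogitsProcessor.postprocess_caption(raw)
assert clean == "An emotional ballad. The track opens with piano. More text."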
acestep/gradio_ui.py CHANGED
@@ -8,6 +8,14 @@ import random
 import glob
 import gradio as gr
 from typing import Callable, Optional, Tuple
+from acestep.constants import (
+    VALID_LANGUAGES,
+    TRACK_NAMES,
+    TASK_TYPES,
+    TASK_TYPES_TURBO,
+    TASK_TYPES_BASE,
+    DEFAULT_DIT_INSTRUCTION,
+)
 
 
 def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=None) -> gr.Blocks:
@@ -296,9 +304,9 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
     # Determine initial task_type choices based on default model
     default_model_lower = (default_model or "").lower()
     if "turbo" in default_model_lower:
-        initial_task_choices = ["text2music", "repaint", "cover"]
+        initial_task_choices = TASK_TYPES_TURBO
     else:
-        initial_task_choices = ["text2music", "repaint", "cover", "extract", "lego", "complete"]
+        initial_task_choices = TASK_TYPES_BASE
 
     with gr.Row():
         with gr.Column(scale=2):
@@ -311,15 +319,14 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
            with gr.Column(scale=8):
                instruction_display_gen = gr.Textbox(
                    label="Instruction",
-                    value="Fill the audio semantic mask based on the given conditions:",
+                    value=DEFAULT_DIT_INSTRUCTION,
                    interactive=False,
                    lines=1,
                    info="Instruction is automatically generated based on task type",
                )
 
        track_name = gr.Dropdown(
-            choices=["woodwinds", "brass", "fx", "synth", "strings", "percussion",
-                     "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"],
+            choices=TRACK_NAMES,
            value=None,
            label="Track Name",
            info="Select track name for lego/extract tasks",
@@ -327,8 +334,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
        )
 
        complete_track_classes = gr.CheckboxGroup(
-            choices=["woodwinds", "brass", "fx", "synth", "strings", "percussion",
-                     "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"],
+            choices=TRACK_NAMES,
            label="Track Names",
            info="Select multiple track classes for complete task",
            visible=False
@@ -410,8 +416,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
        with gr.Accordion("⚙️ Optional Parameters", open=True):
            with gr.Row():
                vocal_language = gr.Dropdown(
-                    choices=["en", "zh", "ja", "ko", "es", "fr", "de"],
-                    value="en",
+                    choices=VALID_LANGUAGES,
+                    value="unknown",
                    label="Vocal Language (optional)",
                    allow_custom_value=True,
                    info="use `unknown` for inst"
@@ -567,6 +573,20 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
                    scale=2,
                )
 
+            with gr.Row():
+                use_cot_caption = gr.Checkbox(
+                    label="CoT Caption",
+                    value=True,
+                    info="Generate caption in CoT (chain-of-thought)",
+                    scale=1,
+                )
+                use_cot_language = gr.Checkbox(
+                    label="CoT Language",
+                    value=True,
+                    info="Generate language in CoT (chain-of-thought)",
+                    scale=1,
+                )
+
            with gr.Row():
                audio_cover_strength = gr.Slider(
                    minimum=0.0,
@@ -625,6 +645,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
        "lm_top_k": lm_top_k,
        "lm_top_p": lm_top_p,
        "lm_negative_prompt": lm_negative_prompt,
+        "use_cot_caption": use_cot_caption,
+        "use_cot_language": use_cot_language,
        "repainting_group": repainting_group,
        "repainting_start": repainting_start,
        "repainting_end": repainting_end,
@@ -824,7 +846,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
                gr.update(visible=False),  # use_adg
                gr.update(visible=False),  # cfg_interval_start
                gr.update(visible=False),  # cfg_interval_end
-                gr.update(choices=["text2music", "repaint", "cover"]),  # task_type
+                gr.update(choices=TASK_TYPES_TURBO),  # task_type
            )
        elif "base" in config_path_lower:
            # Base model: max 100 steps, show CFG/ADG, show all task types
@@ -834,7 +856,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
                gr.update(visible=True),  # use_adg
                gr.update(visible=True),  # cfg_interval_start
                gr.update(visible=True),  # cfg_interval_end
-                gr.update(choices=["text2music", "repaint", "cover", "extract", "lego", "complete"]),  # task_type
+                gr.update(choices=TASK_TYPES_BASE),  # task_type
            )
        else:
            # Default to turbo settings
@@ -844,7 +866,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
-                gr.update(choices=["text2music", "repaint", "cover"]),  # task_type
+                gr.update(choices=TASK_TYPES_TURBO),  # task_type
            )
 
    generation_section["config_path"].change(
@@ -965,6 +987,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
        instruction_display_gen, audio_cover_strength, task_type,
        use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
        think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
+        use_cot_caption, use_cot_language,
        progress=gr.Progress(track_tqdm=True)
    ):
        # If think is enabled (llm_dit mode), generate audio codes using LM first
@@ -1019,6 +1042,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
                top_k=top_k_value,
                top_p=top_p_value,
                user_metadata=user_metadata_to_pass,
+                use_cot_caption=use_cot_caption,
+                use_cot_language=use_cot_language,
            )
 
            # Store LM-generated metadata and audio codes for display
@@ -1076,14 +1101,18 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
            metadata_lines = []
            if lm_generated_metadata.get('bpm'):
                metadata_lines.append(f"- **BPM:** {lm_generated_metadata['bpm']}")
-            if lm_generated_metadata.get('keyscale'):
-                metadata_lines.append(f"- **KeyScale:** {lm_generated_metadata['keyscale']}")
-            if lm_generated_metadata.get('timesignature'):
-                metadata_lines.append(f"- **Time Signature:** {lm_generated_metadata['timesignature']}")
+            if lm_generated_metadata.get('caption'):
+                metadata_lines.append(f"- **User Query Rewritten Caption:** {lm_generated_metadata['caption']}")
            if lm_generated_metadata.get('duration'):
                metadata_lines.append(f"- **Duration:** {lm_generated_metadata['duration']} seconds")
            if lm_generated_metadata.get('genres'):
                metadata_lines.append(f"- **Genres:** {lm_generated_metadata['genres']}")
+            if lm_generated_metadata.get('keyscale'):
+                metadata_lines.append(f"- **KeyScale:** {lm_generated_metadata['keyscale']}")
+            if lm_generated_metadata.get('language'):
+                metadata_lines.append(f"- **Language:** {lm_generated_metadata['language']}")
+            if lm_generated_metadata.get('timesignature'):
+                metadata_lines.append(f"- **Time Signature:** {lm_generated_metadata['timesignature']}")
 
            if metadata_lines:
                metadata_section = "\n\n**🤖 LM-Generated Metadata:**\n" + "\n\n".join(metadata_lines)
@@ -1140,7 +1169,9 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
            generation_section["lm_cfg_scale"],
            generation_section["lm_top_k"],
            generation_section["lm_top_p"],
-            generation_section["lm_negative_prompt"]
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"]
        ],
        outputs=[
            results_section["generated_audio_1"],
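
Note (assumed wiring, not shown verbatim in this commit): the two new checkboxes are forwarded as use_cot_caption / use_cot_language into the LM generation call; on the processor side the natural mapping is to the new skip flags, roughly:

def apply_cot_flags(processor, use_cot_caption: bool, use_cot_language: bool) -> None:
    # Hypothetical glue: unchecking a CoT box means skipping that field in the <think> block.
    processor.set_skip_caption(not use_cot_caption)
    processor.set_skip_language(not use_cot_language)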
acestep/handler.py CHANGED
@@ -23,6 +23,11 @@ import warnings
23
  from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
24
  from transformers.generation.streamers import BaseStreamer
25
  from diffusers.models import AutoencoderOobleck
 
 
 
 
 
26
 
27
 
28
  warnings.filterwarnings("ignore")
@@ -519,10 +524,11 @@ class AceStepHandler:
519
  Args:
520
  task: Task name (e.g., text2music, cover, repaint); kept for logging/future branching.
521
  instruction: Instruction text; default fallback matches service_generate behavior.
522
- caption: Caption string.
523
  lyrics: Lyrics string.
524
  metas: Metadata (str or dict); follows _parse_metas formatting.
525
- vocal_language: Language code for lyrics section.
 
526
 
527
  Returns:
528
  (caption_input_text, lyrics_input_text)
@@ -533,18 +539,45 @@ class AceStepHandler:
533
  instruction=None,
534
  caption="A calm piano melody",
535
  lyrics="la la la",
536
- metas={"bpm": 90, "duration": 45},
537
  vocal_language="en",
538
  )
539
  """
540
  # Align instruction formatting with _prepare_batch
541
- final_instruction = instruction or "Fill the audio semantic mask based on the given conditions:"
542
  if not final_instruction.endswith(":"):
543
  final_instruction = final_instruction + ":"
544
 
545
  parsed_meta = self._parse_metas([metas])[0]
546
- caption_input = SFT_GEN_PROMPT.format(final_instruction, caption, parsed_meta)
547
- lyrics_input = f"# Languages\n{vocal_language}\n\n# Lyric\n{lyrics}<|endoftext|>"
548
  return caption_input, lyrics_input
549
 
550
  def _get_text_hidden_states(self, text_prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -679,41 +712,36 @@ class AceStepHandler:
679
  track_name: Optional[str] = None,
680
  complete_track_classes: Optional[List[str]] = None
681
  ) -> str:
682
- TRACK_NAMES = [
683
- "woodwinds", "brass", "fx", "synth", "strings", "percussion",
684
- "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"
685
- ]
686
-
687
  if task_type == "text2music":
688
- return "Fill the audio semantic mask based on the given conditions:"
689
  elif task_type == "repaint":
690
- return "Repaint the mask area based on the given conditions:"
691
  elif task_type == "cover":
692
- return "Generate audio semantic tokens based on the given conditions:"
693
  elif task_type == "extract":
694
  if track_name:
695
  # Convert to uppercase
696
  track_name_upper = track_name.upper()
697
- return f"Extract the {track_name_upper} track from the audio:"
698
  else:
699
- return "Extract the track from the audio:"
700
  elif task_type == "lego":
701
  if track_name:
702
  # Convert to uppercase
703
  track_name_upper = track_name.upper()
704
- return f"Generate the {track_name_upper} track based on the audio context:"
705
  else:
706
- return "Generate the track based on the audio context:"
707
  elif task_type == "complete":
708
  if complete_track_classes and len(complete_track_classes) > 0:
709
  # Convert to uppercase and join with " | "
710
  track_classes_upper = [t.upper() for t in complete_track_classes]
711
  complete_track_classes_str = " | ".join(track_classes_upper)
712
- return f"Complete the input track with {complete_track_classes_str}:"
713
  else:
714
- return "Complete the input track:"
715
  else:
716
- return "Fill the audio semantic mask based on the given conditions:"
717
 
718
  def process_reference_audio(self, audio_file) -> Optional[torch.Tensor]:
719
  if audio_file is None:
@@ -1247,7 +1275,7 @@ class AceStepHandler:
1247
  # Process instructions early so we can use them for task type detection
1248
  # Use custom instructions if provided, otherwise use default
1249
  if instructions is None:
1250
- instructions = ["Fill the audio semantic mask based on the given conditions:"] * batch_size
1251
 
1252
  # Ensure instructions list has the same length as batch_size
1253
  if len(instructions) != batch_size:
@@ -1257,7 +1285,7 @@ class AceStepHandler:
1257
  # Pad or truncate to match batch_size
1258
  instructions = instructions[:batch_size]
1259
  while len(instructions) < batch_size:
1260
- instructions.append("Fill the audio semantic mask based on the given conditions:")
1261
 
1262
  # Generate chunk_masks and spans based on repainting parameters
1263
  # Also determine if this is a cover task (target audio provided without repainting)
@@ -1415,13 +1443,29 @@ class AceStepHandler:
1415
 
1416
  for i in range(batch_size):
1417
  # Use custom instruction for this batch item
1418
- instruction = instructions[i] if i < len(instructions) else "Fill the audio semantic mask based on the given conditions:"
1419
  # Ensure instruction ends with ":"
1420
  if not instruction.endswith(":"):
1421
  instruction = instruction + ":"
1422
 
1423
- # Format text prompt with custom instruction
1424
- text_prompt = SFT_GEN_PROMPT.format(instruction, captions[i], parsed_metas[i])
1425
 
1426
  # Tokenize text
1427
  text_inputs_dict = self.text_tokenizer(
@@ -1434,8 +1478,8 @@ class AceStepHandler:
1434
  text_token_ids = text_inputs_dict.input_ids[0]
1435
  text_attention_mask = text_inputs_dict.attention_mask[0].bool()
1436
 
1437
- # Format and tokenize lyrics
1438
- lyrics_text = f"# Languages\n{vocal_languages[i]}\n\n# Lyric\n{lyrics[i]}<|endoftext|>"
1439
  lyrics_inputs_dict = self.text_tokenizer(
1440
  lyrics_text,
1441
  padding="longest",
@@ -1495,10 +1539,17 @@ class AceStepHandler:
1495
  non_cover_text_attention_masks = []
1496
  for i in range(batch_size):
1497
  # Use custom instruction for this batch item
1498
- instruction = "Fill the audio semantic mask based on the given conditions:"
1499
 
1500
- # Format text prompt with custom instruction
1501
- text_prompt = SFT_GEN_PROMPT.format(instruction, captions[i], parsed_metas[i])
1502
 
1503
  # Tokenize text
1504
  text_inputs_dict = self.text_tokenizer(
@@ -1991,7 +2042,7 @@ class AceStepHandler:
1991
  audio_code_string: Union[str, List[str]] = "",
1992
  repainting_start: float = 0.0,
1993
  repainting_end: Optional[float] = None,
1994
- instruction: str = "Fill the audio semantic mask based on the given conditions:",
1995
  audio_cover_strength: float = 1.0,
1996
  task_type: str = "text2music",
1997
  use_adg: bool = False,
@@ -2030,7 +2081,7 @@ class AceStepHandler:
2030
  # User has provided audio codes, switch to cover task
2031
  task_type = "cover"
2032
  # Update instruction for cover task
2033
- instruction = "Generate audio semantic tokens based on the given conditions:"
2034
 
2035
  logger.info("[generate_music] Starting generation...")
2036
  if progress:
 
23
  from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
24
  from transformers.generation.streamers import BaseStreamer
25
  from diffusers.models import AutoencoderOobleck
26
+ from acestep.constants import (
27
+ TASK_INSTRUCTIONS,
28
+ TRACK_NAMES,
29
+ DEFAULT_DIT_INSTRUCTION,
30
+ )
31
 
32
 
33
  warnings.filterwarnings("ignore")
 
524
  Args:
525
  task: Task name (e.g., text2music, cover, repaint); kept for logging/future branching.
526
  instruction: Instruction text; default fallback matches service_generate behavior.
527
+ caption: Caption string (fallback if not in metas).
528
  lyrics: Lyrics string.
529
  metas: Metadata (str or dict); follows _parse_metas formatting.
530
+ May contain 'caption' and 'language' fields from LM CoT output.
531
+ vocal_language: Language code for lyrics section (fallback if not in metas).
532
 
533
  Returns:
534
  (caption_input_text, lyrics_input_text)
 
539
  instruction=None,
540
  caption="A calm piano melody",
541
  lyrics="la la la",
542
+ metas={"bpm": 90, "duration": 45, "caption": "LM generated caption", "language": "en"},
543
  vocal_language="en",
544
  )
545
  """
546
  # Align instruction formatting with _prepare_batch
547
+ final_instruction = instruction or DEFAULT_DIT_INSTRUCTION
548
  if not final_instruction.endswith(":"):
549
  final_instruction = final_instruction + ":"
550
 
551
+ # Extract caption and language from metas if available (from LM CoT output)
552
+ # Fallback to user-provided values if not in metas
553
+ actual_caption = caption
554
+ actual_language = vocal_language
555
+
556
+ if metas is not None:
557
+ # Parse metas to dict if it's a string
558
+ if isinstance(metas, str):
559
+ # Try to parse as dict-like string or use as-is
560
+ parsed_metas = self._parse_metas([metas])
561
+ if parsed_metas and isinstance(parsed_metas[0], dict):
562
+ meta_dict = parsed_metas[0]
563
+ else:
564
+ meta_dict = {}
565
+ elif isinstance(metas, dict):
566
+ meta_dict = metas
567
+ else:
568
+ meta_dict = {}
569
+
570
+ # Extract caption from metas if available
571
+ if 'caption' in meta_dict and meta_dict['caption']:
572
+ actual_caption = str(meta_dict['caption'])
573
+
574
+ # Extract language from metas if available
575
+ if 'language' in meta_dict and meta_dict['language']:
576
+ actual_language = str(meta_dict['language'])
577
+
578
  parsed_meta = self._parse_metas([metas])[0]
579
+ caption_input = SFT_GEN_PROMPT.format(final_instruction, actual_caption, parsed_meta)
580
+ lyrics_input = f"# Languages\n{actual_language}\n\n# Lyric\n{lyrics}<|endoftext|>"
581
  return caption_input, lyrics_input
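The precedence implemented above can be summarised as: caption/language fields emitted by the LM chain-of-thought (carried in metas) override the user-supplied caption and vocal_language, which remain the fallback. A standalone sketch of just that rule (the helper name and the dict-only metas handling are illustrative simplifications):

from typing import Optional, Tuple

def resolve_caption_and_language(
    caption: str,
    vocal_language: str,
    metas: Optional[dict],
) -> Tuple[str, str]:
    """Prefer LM CoT outputs ('caption'/'language' keys in metas) over user inputs."""
    meta_dict = metas if isinstance(metas, dict) else {}
    actual_caption = str(meta_dict["caption"]) if meta_dict.get("caption") else caption
    actual_language = str(meta_dict["language"]) if meta_dict.get("language") else vocal_language
    return actual_caption, actual_language

# The LM-rewritten caption and detected language win over the raw user inputs:
print(resolve_caption_and_language(
    "A calm piano melody", "en",
    {"bpm": 90, "caption": "Gentle solo piano, 90 BPM, reflective mood", "language": "zh"},
))  # -> ('Gentle solo piano, 90 BPM, reflective mood', 'zh')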
582
 
583
  def _get_text_hidden_states(self, text_prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
 
712
  track_name: Optional[str] = None,
713
  complete_track_classes: Optional[List[str]] = None
714
  ) -> str:
715
  if task_type == "text2music":
716
+ return TASK_INSTRUCTIONS["text2music"]
717
  elif task_type == "repaint":
718
+ return TASK_INSTRUCTIONS["repaint"]
719
  elif task_type == "cover":
720
+ return TASK_INSTRUCTIONS["cover"]
721
  elif task_type == "extract":
722
  if track_name:
723
  # Convert to uppercase
724
  track_name_upper = track_name.upper()
725
+ return TASK_INSTRUCTIONS["extract"].format(TRACK_NAME=track_name_upper)
726
  else:
727
+ return TASK_INSTRUCTIONS["extract_default"]
728
  elif task_type == "lego":
729
  if track_name:
730
  # Convert to uppercase
731
  track_name_upper = track_name.upper()
732
+ return TASK_INSTRUCTIONS["lego"].format(TRACK_NAME=track_name_upper)
733
  else:
734
+ return TASK_INSTRUCTIONS["lego_default"]
735
  elif task_type == "complete":
736
  if complete_track_classes and len(complete_track_classes) > 0:
737
  # Convert to uppercase and join with " | "
738
  track_classes_upper = [t.upper() for t in complete_track_classes]
739
  complete_track_classes_str = " | ".join(track_classes_upper)
740
+ return TASK_INSTRUCTIONS["complete"].format(TRACK_CLASSES=complete_track_classes_str)
741
  else:
742
+ return TASK_INSTRUCTIONS["complete_default"]
743
  else:
744
+ return TASK_INSTRUCTIONS["text2music"]
745
 
746
  def process_reference_audio(self, audio_file) -> Optional[torch.Tensor]:
747
  if audio_file is None:
 
1275
  # Process instructions early so we can use them for task type detection
1276
  # Use custom instructions if provided, otherwise use default
1277
  if instructions is None:
1278
+ instructions = [DEFAULT_DIT_INSTRUCTION] * batch_size
1279
 
1280
  # Ensure instructions list has the same length as batch_size
1281
  if len(instructions) != batch_size:
 
1285
  # Pad or truncate to match batch_size
1286
  instructions = instructions[:batch_size]
1287
  while len(instructions) < batch_size:
1288
+ instructions.append(DEFAULT_DIT_INSTRUCTION)
1289
 
1290
  # Generate chunk_masks and spans based on repainting parameters
1291
  # Also determine if this is a cover task (target audio provided without repainting)
 
1443
 
1444
  for i in range(batch_size):
1445
  # Use custom instruction for this batch item
1446
+ instruction = instructions[i] if i < len(instructions) else DEFAULT_DIT_INSTRUCTION
1447
  # Ensure instruction ends with ":"
1448
  if not instruction.endswith(":"):
1449
  instruction = instruction + ":"
1450
 
1451
+ # Extract caption and language from metas if available (from LM CoT output)
1452
+ # Fallback to user-provided values if not in metas
1453
+ actual_caption = captions[i]
1454
+ actual_language = vocal_languages[i]
1455
+
1456
+ # Check if metas contains caption/language from LM CoT
1457
+ if i < len(parsed_metas) and parsed_metas[i]:
1458
+ meta_dict = parsed_metas[i]
1459
+ if isinstance(meta_dict, dict):
1460
+ # Extract caption from metas if available
1461
+ if 'caption' in meta_dict and meta_dict['caption']:
1462
+ actual_caption = str(meta_dict['caption'])
1463
+ # Extract language from metas if available
1464
+ if 'language' in meta_dict and meta_dict['language']:
1465
+ actual_language = str(meta_dict['language'])
1466
+
1467
+ # Format text prompt with custom instruction (using LM-generated caption if available)
1468
+ text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
1469
 
1470
  # Tokenize text
1471
  text_inputs_dict = self.text_tokenizer(
 
1478
  text_token_ids = text_inputs_dict.input_ids[0]
1479
  text_attention_mask = text_inputs_dict.attention_mask[0].bool()
1480
 
1481
+ # Format and tokenize lyrics (using LM-generated language if available)
1482
+ lyrics_text = f"# Languages\n{actual_language}\n\n# Lyric\n{lyrics[i]}<|endoftext|>"
1483
  lyrics_inputs_dict = self.text_tokenizer(
1484
  lyrics_text,
1485
  padding="longest",
 
1539
  non_cover_text_attention_masks = []
1540
  for i in range(batch_size):
1541
  # Use custom instruction for this batch item
1542
+ instruction = DEFAULT_DIT_INSTRUCTION
1543
+
1544
+ # Extract caption from metas if available (from LM CoT output)
1545
+ actual_caption = captions[i]
1546
+ if i < len(parsed_metas) and parsed_metas[i]:
1547
+ meta_dict = parsed_metas[i]
1548
+ if isinstance(meta_dict, dict) and 'caption' in meta_dict and meta_dict['caption']:
1549
+ actual_caption = str(meta_dict['caption'])
1550
 
1551
+ # Format text prompt with custom instruction (using LM-generated caption if available)
1552
+ text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
1553
 
1554
  # Tokenize text
1555
  text_inputs_dict = self.text_tokenizer(
 
2042
  audio_code_string: Union[str, List[str]] = "",
2043
  repainting_start: float = 0.0,
2044
  repainting_end: Optional[float] = None,
2045
+ instruction: str = DEFAULT_DIT_INSTRUCTION,
2046
  audio_cover_strength: float = 1.0,
2047
  task_type: str = "text2music",
2048
  use_adg: bool = False,
 
2081
  # User has provided audio codes, switch to cover task
2082
  task_type = "cover"
2083
  # Update instruction for cover task
2084
+ instruction = TASK_INSTRUCTIONS["cover"]
2085
 
2086
  logger.info("[generate_music] Starting generation...")
2087
  if progress:
acestep/llm_inference.py CHANGED
@@ -17,6 +17,7 @@ from transformers.generation.logits_process import (
17
  RepetitionPenaltyLogitsProcessor,
18
  )
19
  from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
 
20
 
21
 
22
  class LLMHandler:
@@ -244,6 +245,8 @@ class LLMHandler:
244
  target_duration: Optional[float] = None,
245
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
246
  stop_at_reasoning: bool = False,
 
 
247
  ) -> str:
248
  """Shared vllm path: accept prebuilt formatted prompt and return text."""
249
  from nanovllm import SamplingParams
@@ -265,6 +268,9 @@ class LLMHandler:
265
  # Always call set_user_metadata to ensure previous settings are cleared if None
266
  self.constrained_processor.set_user_metadata(user_metadata)
267
  self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
 
 
 
268
 
269
  constrained_processor = self.constrained_processor
270
 
@@ -318,6 +324,8 @@ class LLMHandler:
318
  target_duration: Optional[float] = None,
319
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
320
  stop_at_reasoning: bool = False,
 
 
321
  ) -> str:
322
  """Shared PyTorch path: accept prebuilt formatted prompt and return text."""
323
  inputs = self.llm_tokenizer(
@@ -338,6 +346,9 @@ class LLMHandler:
338
  # Always call set_user_metadata to ensure previous settings are cleared if None
339
  self.constrained_processor.set_user_metadata(user_metadata)
340
  self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
 
 
 
341
 
342
  constrained_processor = self.constrained_processor
343
 
@@ -472,6 +483,8 @@ class LLMHandler:
472
  constrained_decoding_debug: bool = False,
473
  target_duration: Optional[float] = None,
474
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
 
 
475
  ) -> Tuple[Dict[str, Any], str, str]:
476
  """Feishu-compatible LM generation.
477
 
@@ -483,6 +496,8 @@ class LLMHandler:
483
  5 codes = 1 second. If specified, blocks EOS until target reached.
484
  user_metadata: User-provided metadata fields (e.g. bpm/duration/keyscale/timesignature).
485
  If specified, constrained decoding will inject these values directly.
 
 
486
  """
487
  infer_type = (infer_type or "").strip().lower()
488
  if infer_type not in {"dit", "llm_dit"}:
@@ -509,6 +524,8 @@ class LLMHandler:
509
  "repetition_penalty": repetition_penalty,
510
  "target_duration": target_duration,
511
  "user_metadata": user_metadata,
 
 
512
  },
513
  use_constrained_decoding=use_constrained_decoding,
514
  constrained_decoding_debug=constrained_decoding_debug,
@@ -540,7 +557,7 @@ class LLMHandler:
540
  prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
541
  return self.llm_tokenizer.apply_chat_template(
542
  [
543
- {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
544
  {"role": "user", "content": prompt},
545
  ],
546
  tokenize=False,
@@ -591,6 +608,8 @@ class LLMHandler:
591
  repetition_penalty = cfg.get("repetition_penalty", 1.0)
592
  target_duration = cfg.get("target_duration")
593
  user_metadata = cfg.get("user_metadata") # User-provided metadata fields
 
 
594
 
595
  try:
596
  if self.llm_backend == "vllm":
@@ -607,6 +626,8 @@ class LLMHandler:
607
  target_duration=target_duration,
608
  user_metadata=user_metadata,
609
  stop_at_reasoning=stop_at_reasoning,
 
 
610
  )
611
  return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
612
 
@@ -624,6 +645,8 @@ class LLMHandler:
624
  target_duration=target_duration,
625
  user_metadata=user_metadata,
626
  stop_at_reasoning=stop_at_reasoning,
 
 
627
  )
628
  return output_text, f"✅ Generated successfully (pt) | length={len(output_text)}"
629
 
@@ -928,9 +951,11 @@ class LLMHandler:
928
  Expected format:
929
  <think>
930
  bpm: 73
 
931
  duration: 273
932
  genres: Chinese folk
933
  keyscale: G major
 
934
  timesignature: 4
935
  </think>
936
 
@@ -973,32 +998,69 @@ class LLMHandler:
973
  lines_before_codes = output_text.split('<|audio_code_')[0] if '<|audio_code_' in output_text else output_text
974
  reasoning_text = lines_before_codes.strip()
975
 
976
- # Parse metadata fields
977
  if reasoning_text:
978
- for line in reasoning_text.split('\n'):
979
- line = line.strip()
980
- if ':' in line and not line.startswith('<'):
981
  parts = line.split(':', 1)
982
  if len(parts) == 2:
983
- key = parts[0].strip().lower()
984
- value = parts[1].strip()
985
-
986
- if key == 'bpm':
987
- try:
988
- metadata['bpm'] = int(value)
989
- except:
990
- metadata['bpm'] = value
991
- elif key == 'duration':
992
- try:
993
- metadata['duration'] = int(value)
994
- except:
995
- metadata['duration'] = value
996
- elif key == 'genres':
997
- metadata['genres'] = value
998
- elif key == 'keyscale':
999
- metadata['keyscale'] = value
1000
- elif key == 'timesignature':
1001
- metadata['timesignature'] = value
1002
 
1003
  return metadata, audio_codes
1004
 
 
17
  RepetitionPenaltyLogitsProcessor,
18
  )
19
  from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
20
+ from acestep.constants import DEFAULT_LM_INSTRUCTION
21
 
22
 
23
  class LLMHandler:
 
245
  target_duration: Optional[float] = None,
246
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
247
  stop_at_reasoning: bool = False,
248
+ skip_caption: bool = False,
249
+ skip_language: bool = False,
250
  ) -> str:
251
  """Shared vllm path: accept prebuilt formatted prompt and return text."""
252
  from nanovllm import SamplingParams
 
268
  # Always call set_user_metadata to ensure previous settings are cleared if None
269
  self.constrained_processor.set_user_metadata(user_metadata)
270
  self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
271
+ # Set skip_caption and skip_language based on flags
272
+ self.constrained_processor.set_skip_caption(skip_caption)
273
+ self.constrained_processor.set_skip_language(skip_language)
274
 
275
  constrained_processor = self.constrained_processor
276
 
 
324
  target_duration: Optional[float] = None,
325
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
326
  stop_at_reasoning: bool = False,
327
+ skip_caption: bool = False,
328
+ skip_language: bool = False,
329
  ) -> str:
330
  """Shared PyTorch path: accept prebuilt formatted prompt and return text."""
331
  inputs = self.llm_tokenizer(
 
346
  # Always call set_user_metadata to ensure previous settings are cleared if None
347
  self.constrained_processor.set_user_metadata(user_metadata)
348
  self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
349
+ # Set skip_caption and skip_language based on flags
350
+ self.constrained_processor.set_skip_caption(skip_caption)
351
+ self.constrained_processor.set_skip_language(skip_language)
352
 
353
  constrained_processor = self.constrained_processor
354
 
 
483
  constrained_decoding_debug: bool = False,
484
  target_duration: Optional[float] = None,
485
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
486
+ use_cot_caption: bool = True,
487
+ use_cot_language: bool = True,
488
  ) -> Tuple[Dict[str, Any], str, str]:
489
  """Feishu-compatible LM generation.
490
 
 
496
  5 codes = 1 second. If specified, blocks EOS until target reached.
497
  user_metadata: User-provided metadata fields (e.g. bpm/duration/keyscale/timesignature).
498
  If specified, constrained decoding will inject these values directly.
499
+ use_cot_caption: Whether to generate caption in CoT (default True).
500
+ use_cot_language: Whether to generate language in CoT (default True).
501
  """
502
  infer_type = (infer_type or "").strip().lower()
503
  if infer_type not in {"dit", "llm_dit"}:
 
524
  "repetition_penalty": repetition_penalty,
525
  "target_duration": target_duration,
526
  "user_metadata": user_metadata,
527
+ "skip_caption": not use_cot_caption,
528
+ "skip_language": not use_cot_language,
529
  },
530
  use_constrained_decoding=use_constrained_decoding,
531
  constrained_decoding_debug=constrained_decoding_debug,
 
557
  prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
558
  return self.llm_tokenizer.apply_chat_template(
559
  [
560
+ {"role": "system", "content": f"# Instruction\n{DEFAULT_LM_INSTRUCTION}\n\n"},
561
  {"role": "user", "content": prompt},
562
  ],
563
  tokenize=False,
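Spelled out, the chat-template input assembled here looks like the snippet below before tokenization; DEFAULT_LM_INSTRUCTION is expanded to the literal it replaces (see the removed system-prompt line earlier in this hunk):

# Illustrative: the message list passed to llm_tokenizer.apply_chat_template(..., tokenize=False).
DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"

caption = "A calm piano melody"
lyrics = "la la la"
prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"

messages = [
    {"role": "system", "content": f"# Instruction\n{DEFAULT_LM_INSTRUCTION}\n\n"},
    {"role": "user", "content": prompt},
]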
 
608
  repetition_penalty = cfg.get("repetition_penalty", 1.0)
609
  target_duration = cfg.get("target_duration")
610
  user_metadata = cfg.get("user_metadata") # User-provided metadata fields
611
+ skip_caption = cfg.get("skip_caption", False) # Skip caption generation in CoT
612
+ skip_language = cfg.get("skip_language", False) # Skip language generation in CoT
613
 
614
  try:
615
  if self.llm_backend == "vllm":
 
626
  target_duration=target_duration,
627
  user_metadata=user_metadata,
628
  stop_at_reasoning=stop_at_reasoning,
629
+ skip_caption=skip_caption,
630
+ skip_language=skip_language,
631
  )
632
  return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
633
 
 
645
  target_duration=target_duration,
646
  user_metadata=user_metadata,
647
  stop_at_reasoning=stop_at_reasoning,
648
+ skip_caption=skip_caption,
649
+ skip_language=skip_language,
650
  )
651
  return output_text, f"✅ Generated successfully (pt) | length={len(output_text)}"
652
 
 
951
  Expected format:
952
  <think>
953
  bpm: 73
954
+ caption: A calm piano melody
955
  duration: 273
956
  genres: Chinese folk
957
  keyscale: G major
958
+ language: en
959
  timesignature: 4
960
  </think>
961
 
 
998
  lines_before_codes = output_text.split('<|audio_code_')[0] if '<|audio_code_' in output_text else output_text
999
  reasoning_text = lines_before_codes.strip()
1000
 
1001
+ # Parse metadata fields with YAML multi-line value support
1002
  if reasoning_text:
1003
+ lines = reasoning_text.split('\n')
1004
+ current_key = None
1005
+ current_value_lines = []
1006
+
1007
+ def save_current_field():
1008
+ """Save the accumulated field value"""
1009
+ nonlocal current_key, current_value_lines
1010
+ if current_key and current_value_lines:
1011
+ # Join multi-line value
1012
+ value = '\n'.join(current_value_lines)
1013
+
1014
+ if current_key == 'bpm':
1015
+ try:
1016
+ metadata['bpm'] = int(value.strip())
1017
+ except (ValueError, TypeError):
1018
+ metadata['bpm'] = value.strip()
1019
+ elif current_key == 'caption':
1020
+ # Post-process caption to remove YAML multi-line formatting
1021
+ metadata['caption'] = MetadataConstrainedLogitsProcessor.postprocess_caption(value)
1022
+ elif current_key == 'duration':
1023
+ try:
1024
+ metadata['duration'] = int(value.strip())
1025
+ except (ValueError, TypeError):
1026
+ metadata['duration'] = value.strip()
1027
+ elif current_key == 'genres':
1028
+ metadata['genres'] = value.strip()
1029
+ elif current_key == 'keyscale':
1030
+ metadata['keyscale'] = value.strip()
1031
+ elif current_key == 'language':
1032
+ metadata['language'] = value.strip()
1033
+ elif current_key == 'timesignature':
1034
+ metadata['timesignature'] = value.strip()
1035
+
1036
+ current_key = None
1037
+ current_value_lines = []
1038
+
1039
+ for line in lines:
1040
+ # Skip lines starting with '<' (tags)
1041
+ if line.strip().startswith('<'):
1042
+ continue
1043
+
1044
+ # Check if this is a new field (no leading spaces and contains ':')
1045
+ if line and not line[0].isspace() and ':' in line:
1046
+ # Save previous field if any
1047
+ save_current_field()
1048
+
1049
+ # Parse new field
1050
  parts = line.split(':', 1)
1051
  if len(parts) == 2:
1052
+ current_key = parts[0].strip().lower()
1053
+ # First line of value (after colon)
1054
+ first_value = parts[1]
1055
+ if first_value.strip():
1056
+ current_value_lines.append(first_value)
1057
+ elif line.startswith(' ') or line.startswith('\t'):
1058
+ # Continuation line (YAML multi-line value)
1059
+ if current_key:
1060
+ current_value_lines.append(line)
1061
+
1062
+ # Don't forget to save the last field
1063
+ save_current_field()
1064
 
1065
  return metadata, audio_codes
1066
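To make the multi-line parsing above concrete, here is a self-contained rerun of the same accumulation logic on a sample <think>-style block. It is simplified: audio codes are ignored and the caption post-processing (MetadataConstrainedLogitsProcessor.postprocess_caption) is left out, so the YAML block marker survives in the raw value; the exact YAML style the LM emits is an assumption.

# Standalone sketch of the field-accumulation loop above (simplified, no postprocessing).
reasoning_text = """bpm: 73
caption: |
  A tender Chinese folk ballad,
  fingerpicked guitar and soft strings
duration: 273
genres: Chinese folk
keyscale: G major
language: zh
timesignature: 4"""

metadata = {}
current_key, current_value_lines = None, []

def save_current_field():
    global current_key, current_value_lines
    if current_key and current_value_lines:
        metadata[current_key] = "\n".join(current_value_lines).strip()
    current_key, current_value_lines = None, []

for line in reasoning_text.split("\n"):
    if line.strip().startswith("<"):            # skip tag lines such as <think>
        continue
    if line and not line[0].isspace() and ":" in line:
        save_current_field()                     # flush the previous field
        key, _, first_value = line.partition(":")
        current_key = key.strip().lower()
        if first_value.strip():
            current_value_lines.append(first_value)
    elif line.startswith((" ", "\t")) and current_key:
        current_value_lines.append(line)         # indented continuation line
save_current_field()

print(metadata["bpm"])       # '73' (the real parser converts this to int)
print(metadata["language"])  # 'zh'
print(metadata["caption"])   # raw multi-line value; the real parser strips the '|' marker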