Spaces:

ACE-Step
/

Ace-Step-v1.5

Running on A100

App Files Files Community

ChuxiJ committed on Jan 14

Commit

2b1ad1c

1 Parent(s): 2d3816e

test rewrite format

Browse files

Files changed (20) hide show

.gitignore +2 -1
acestep/gradio_ui/events/__init__.py +32 -0
acestep/gradio_ui/events/generation_handlers.py +143 -40
acestep/gradio_ui/events/results_handlers.py +13 -3
acestep/gradio_ui/i18n/en.json +6 -2
acestep/gradio_ui/i18n/ja.json +6 -2
acestep/gradio_ui/i18n/zh.json +6 -2
acestep/gradio_ui/interfaces/generation.py +37 -18
acestep/inference.py +179 -19
acestep/llm_inference.py +214 -22
examples/simple_mode/example_01.json +1 -1
examples/simple_mode/example_02.json +1 -1
examples/simple_mode/example_03.json +1 -1
examples/simple_mode/example_04.json +1 -1
examples/simple_mode/example_05.json +1 -1
examples/simple_mode/example_06.json +1 -1
examples/simple_mode/example_07.json +1 -1
examples/simple_mode/example_08.json +1 -1
examples/simple_mode/example_09.json +1 -1
examples/simple_mode/example_10.json +1 -1

.gitignore CHANGED Viewed

@@ -220,4 +220,5 @@ discord_bot/
 feishu_bot/
 tmp*
 torchinductor_root/
-scripts/

 feishu_bot/
 tmp*
 torchinductor_root/
+scripts/
+checkpoints_legacy/

acestep/gradio_ui/events/__init__.py CHANGED Viewed

@@ -190,6 +190,37 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
         outputs=[generation_section["lyrics"]]
     )
     # ========== Simple/Custom Mode Toggle ==========
     generation_section["generation_mode"].change(
         fn=gen_h.handle_generation_mode_change,
@@ -245,6 +276,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["audio_duration"],
             generation_section["key_scale"],
             generation_section["vocal_language"],
             generation_section["time_signature"],
             generation_section["instrumental_checkbox"],
             generation_section["caption_accordion"],

         outputs=[generation_section["lyrics"]]
     )
+    # ========== Format Button ==========
+    # Note: cfg_scale and negative_prompt are not supported in format mode
+    generation_section["format_btn"].click(
+        fn=lambda caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug: gen_h.handle_format_sample(
+            llm_handler, caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug
+        ),
+        inputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["lm_temperature"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["constrained_decoding_debug"],
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"],
+            results_section["status_output"],
+        ]
+    )
     # ========== Simple/Custom Mode Toggle ==========
     generation_section["generation_mode"].change(
         fn=gen_h.handle_generation_mode_change,
             generation_section["audio_duration"],
             generation_section["key_scale"],
             generation_section["vocal_language"],
+            generation_section["simple_vocal_language"],
             generation_section["time_signature"],
             generation_section["instrumental_checkbox"],
             generation_section["caption_accordion"],

acestep/gradio_ui/events/generation_handlers.py CHANGED Viewed

@@ -13,7 +13,7 @@ from acestep.constants import (
     TASK_TYPES_BASE,
 )
 from acestep.gradio_ui.i18n import t
-from acestep.inference import understand_music, create_sample
 def load_metadata(file_obj):
@@ -256,7 +256,7 @@ def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug
 def load_random_simple_description():
     """Load a random description from the simple_mode examples directory.
     Returns:
         Tuple of (description, instrumental, vocal_language) for updating UI components
     """
@@ -265,39 +265,39 @@ def load_random_simple_description():
         current_file = os.path.abspath(__file__)
         # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
         project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
         # Construct the examples directory path
         examples_dir = os.path.join(project_root, "examples", "simple_mode")
         # Check if directory exists
         if not os.path.exists(examples_dir):
             gr.Warning(t("messages.simple_examples_not_found"))
             return gr.update(), gr.update(), gr.update()
         # Find all JSON files in the directory
         json_files = glob.glob(os.path.join(examples_dir, "*.json"))
         if not json_files:
             gr.Warning(t("messages.simple_examples_empty"))
             return gr.update(), gr.update(), gr.update()
         # Randomly select one file
         selected_file = random.choice(json_files)
         # Read and parse JSON
         try:
             with open(selected_file, 'r', encoding='utf-8') as f:
                 data = json.load(f)
             # Extract fields
             description = data.get('description', '')
             instrumental = data.get('instrumental', False)
-            vocal_language = data.get('vocal_language', ['unknown'])
-            # Ensure vocal_language is a list
-            if isinstance(vocal_language, str):
-                vocal_language = [vocal_language]
             gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
             return description, instrumental, vocal_language
@@ -564,7 +564,7 @@ def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
 def handle_simple_instrumental_change(is_instrumental: bool):
     """
     Handle simple mode instrumental checkbox changes.
-    When checked: set vocal_language to ["unknown"] and disable editing.
     When unchecked: enable vocal_language editing.
     Args:
@@ -574,7 +574,7 @@ def handle_simple_instrumental_change(is_instrumental: bool):
         gr.update for simple_vocal_language dropdown
     """
     if is_instrumental:
-        return gr.update(value=["unknown"], interactive=False)
     else:
         return gr.update(interactive=True)
@@ -653,7 +653,7 @@ def handle_create_sample(
     llm_handler,
     query: str,
     instrumental: bool,
-    vocal_language: list,
     lm_temperature: float,
     lm_top_k: int,
     lm_top_p: float,
@@ -671,7 +671,7 @@ def handle_create_sample(
         llm_handler: LLM handler instance
         query: User's natural language music description
         instrumental: Whether to generate instrumental music
-        vocal_language: List of preferred vocal languages for constrained decoding
         lm_temperature: LLM temperature for generation
         lm_top_k: LLM top-k sampling
         lm_top_p: LLM top-p sampling
@@ -695,27 +695,6 @@ def handle_create_sample(
         - is_format_caption_state (True)
         - status_output
     """
-    # Validate query
-    if not query or not query.strip():
-        gr.Warning(t("messages.empty_query"))
-        return (
-            gr.update(),  # captions - no change
-            gr.update(),  # lyrics - no change
-            gr.update(),  # bpm - no change
-            gr.update(),  # audio_duration - no change
-            gr.update(),  # key_scale - no change
-            gr.update(),  # vocal_language - no change
-            gr.update(),  # time_signature - no change
-            gr.update(),  # instrumental_checkbox - no change
-            gr.update(),  # caption_accordion - no change
-            gr.update(),  # lyrics_accordion - no change
-            gr.update(interactive=False),  # generate_btn - keep disabled
-            False,  # simple_sample_created - still False
-            gr.update(),  # think_checkbox - no change
-            gr.update(),  # is_format_caption_state - no change
-            t("messages.empty_query"),  # status_output
-        )
     # Check if LLM is initialized
     if not llm_handler.llm_initialized:
         gr.Warning(t("messages.lm_not_initialized"))
@@ -765,6 +744,7 @@ def handle_create_sample(
             gr.update(),  # audio_duration - no change
             gr.update(),  # key_scale - no change
             gr.update(),  # vocal_language - no change
             gr.update(),  # time_signature - no change
             gr.update(),  # instrumental_checkbox - no change
             gr.update(),  # caption_accordion - no change
@@ -786,6 +766,7 @@ def handle_create_sample(
         result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
         result.keyscale,  # key_scale
         result.language,  # vocal_language
         result.timesignature,  # time_signature
         result.instrumental,  # instrumental_checkbox
         gr.update(open=True),  # caption_accordion - expand
@@ -798,3 +779,125 @@ def handle_create_sample(
     )

     TASK_TYPES_BASE,
 )
 from acestep.gradio_ui.i18n import t
+from acestep.inference import understand_music, create_sample, format_sample
 def load_metadata(file_obj):
 def load_random_simple_description():
     """Load a random description from the simple_mode examples directory.
     Returns:
         Tuple of (description, instrumental, vocal_language) for updating UI components
     """
         current_file = os.path.abspath(__file__)
         # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
         project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
         # Construct the examples directory path
         examples_dir = os.path.join(project_root, "examples", "simple_mode")
         # Check if directory exists
         if not os.path.exists(examples_dir):
             gr.Warning(t("messages.simple_examples_not_found"))
             return gr.update(), gr.update(), gr.update()
         # Find all JSON files in the directory
         json_files = glob.glob(os.path.join(examples_dir, "*.json"))
         if not json_files:
             gr.Warning(t("messages.simple_examples_empty"))
             return gr.update(), gr.update(), gr.update()
         # Randomly select one file
         selected_file = random.choice(json_files)
         # Read and parse JSON
         try:
             with open(selected_file, 'r', encoding='utf-8') as f:
                 data = json.load(f)
             # Extract fields
             description = data.get('description', '')
             instrumental = data.get('instrumental', False)
+            vocal_language = data.get('vocal_language', 'unknown')
+            # Ensure vocal_language is a string
+            if isinstance(vocal_language, list):
+                vocal_language = vocal_language[0] if vocal_language else 'unknown'
             gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
             return description, instrumental, vocal_language
 def handle_simple_instrumental_change(is_instrumental: bool):
     """
     Handle simple mode instrumental checkbox changes.
+    When checked: set vocal_language to "unknown" and disable editing.
     When unchecked: enable vocal_language editing.
     Args:
         gr.update for simple_vocal_language dropdown
     """
     if is_instrumental:
+        return gr.update(value="unknown", interactive=False)
     else:
         return gr.update(interactive=True)
     llm_handler,
     query: str,
     instrumental: bool,
+    vocal_language: str,
     lm_temperature: float,
     lm_top_k: int,
     lm_top_p: float,
         llm_handler: LLM handler instance
         query: User's natural language music description
         instrumental: Whether to generate instrumental music
+        vocal_language: Preferred vocal language for constrained decoding
         lm_temperature: LLM temperature for generation
         lm_top_k: LLM top-k sampling
         lm_top_p: LLM top-p sampling
         - is_format_caption_state (True)
         - status_output
     """
     # Check if LLM is initialized
     if not llm_handler.llm_initialized:
         gr.Warning(t("messages.lm_not_initialized"))
             gr.update(),  # audio_duration - no change
             gr.update(),  # key_scale - no change
             gr.update(),  # vocal_language - no change
+            gr.update(),  # simple vocal_language - no change
             gr.update(),  # time_signature - no change
             gr.update(),  # instrumental_checkbox - no change
             gr.update(),  # caption_accordion - no change
         result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
         result.keyscale,  # key_scale
         result.language,  # vocal_language
+        result.language,  # simple vocal_language
         result.timesignature,  # time_signature
         result.instrumental,  # instrumental_checkbox
         gr.update(open=True),  # caption_accordion - expand
     )
+def handle_format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    bpm,
+    audio_duration,
+    key_scale: str,
+    time_signature: str,
+    lm_temperature: float,
+    lm_top_k: int,
+    lm_top_p: float,
+    constrained_decoding_debug: bool = False,
+):
+    """
+    Handle the Format button click to format caption and lyrics.
+    Takes user-provided caption and lyrics, and uses the LLM to generate
+    structured music metadata and an enhanced description.
+    Note: cfg_scale and negative_prompt are not supported in format mode.
+    Args:
+        llm_handler: LLM handler instance
+        caption: User's caption/description
+        lyrics: User's lyrics
+        bpm: User-provided BPM (optional, for constrained decoding)
+        audio_duration: User-provided duration (optional, for constrained decoding)
+        key_scale: User-provided key scale (optional, for constrained decoding)
+        time_signature: User-provided time signature (optional, for constrained decoding)
+        lm_temperature: LLM temperature for generation
+        lm_top_k: LLM top-k sampling
+        lm_top_p: LLM top-p sampling
+        constrained_decoding_debug: Whether to enable debug logging
+    Returns:
+        Tuple of updates for:
+        - captions
+        - lyrics
+        - bpm
+        - audio_duration
+        - key_scale
+        - vocal_language
+        - time_signature
+        - is_format_caption_state
+        - status_output
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        gr.Warning(t("messages.lm_not_initialized"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # is_format_caption_state - no change
+            t("messages.lm_not_initialized"),  # status_output
+        )
+    # Build user_metadata from provided values for constrained decoding
+    user_metadata = {}
+    if bpm is not None and bpm > 0:
+        user_metadata['bpm'] = int(bpm)
+    if audio_duration is not None and audio_duration > 0:
+        user_metadata['duration'] = int(audio_duration)
+    if key_scale and key_scale.strip():
+        user_metadata['keyscale'] = key_scale.strip()
+    if time_signature and time_signature.strip():
+        user_metadata['timesignature'] = time_signature.strip()
+    # Only pass user_metadata if we have at least one field
+    user_metadata_to_pass = user_metadata if user_metadata else None
+    # Convert LM parameters
+    top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
+    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
+    # Call format_sample API
+    result = format_sample(
+        llm_handler=llm_handler,
+        caption=caption,
+        lyrics=lyrics,
+        user_metadata=user_metadata_to_pass,
+        temperature=lm_temperature,
+        top_k=top_k_value,
+        top_p=top_p_value,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error
+    if not result.success:
+        gr.Warning(result.status_message or t("messages.format_failed"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # is_format_caption_state - no change
+            result.status_message or t("messages.format_failed"),  # status_output
+        )
+    # Success - populate fields
+    gr.Info(t("messages.format_success"))
+    return (
+        result.caption,  # captions
+        result.lyrics,  # lyrics
+        result.bpm,  # bpm
+        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
+        result.keyscale,  # key_scale
+        result.language,  # vocal_language
+        result.timesignature,  # time_signature
+        True,  # is_format_caption_state - True (LM-formatted)
+        result.status_message,  # status_output
+    )

acestep/gradio_ui/events/results_handlers.py CHANGED Viewed

@@ -465,6 +465,14 @@ def generate_with_progress(
 ):
     """Generate audio with progress tracking"""
     # step 1: prepare inputs
     # generate_music, GenerationParams, GenerationConfig
     gen_params = GenerationParams(
@@ -496,7 +504,7 @@ def generate_with_progress(
         lm_top_k=lm_top_k,
         lm_top_p=lm_top_p,
         lm_negative_prompt=lm_negative_prompt,
-        use_cot_metas=use_cot_metas,
         use_cot_caption=use_cot_caption,
         use_cot_language=use_cot_language,
         use_constrained_decoding=True,
@@ -587,7 +595,7 @@ def generate_with_progress(
     # Clear lrc_display with empty string - this triggers .change() to clear subtitles
     clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
     clear_accordions = [gr.skip() for _ in range(8)]  # Don't change accordion visibility
-    dump_audio = [None for _ in range(8)]
     yield (
         # Audio outputs - just skip, value will be updated in loop
         # Subtitles will be cleared via lrc_display.change()
@@ -1682,6 +1690,8 @@ def generate_next_batch_background(
         # Call generate_with_progress with the saved parameters
         # Note: generate_with_progress is a generator, need to iterate through it
         generator = generate_with_progress(
             dit_handler,
             llm_handler,
@@ -1719,7 +1729,7 @@ def generate_next_batch_background(
             use_cot_metas=params.get("use_cot_metas"),
             use_cot_caption=params.get("use_cot_caption"),
             use_cot_language=params.get("use_cot_language"),
-            is_format_caption=is_format_caption,
             constrained_decoding_debug=params.get("constrained_decoding_debug"),
             allow_lm_batch=params.get("allow_lm_batch"),
             auto_score=params.get("auto_score"),

 ):
     """Generate audio with progress tracking"""
+    # Skip Phase 1 metas COT if sample is already formatted (from LLM/file/random)
+    # This avoids redundant LLM calls since metas (bpm, keyscale, etc.) are already generated
+    actual_use_cot_metas = use_cot_metas
+    if is_format_caption and use_cot_metas:
+        actual_use_cot_metas = False
+        logger.info("[generate_with_progress] Skipping Phase 1 metas COT: sample is already formatted (is_format_caption=True)")
+        gr.Info(t("messages.skipping_metas_cot"))
     # step 1: prepare inputs
     # generate_music, GenerationParams, GenerationConfig
     gen_params = GenerationParams(
         lm_top_k=lm_top_k,
         lm_top_p=lm_top_p,
         lm_negative_prompt=lm_negative_prompt,
+        use_cot_metas=actual_use_cot_metas,
         use_cot_caption=use_cot_caption,
         use_cot_language=use_cot_language,
         use_constrained_decoding=True,
     # Clear lrc_display with empty string - this triggers .change() to clear subtitles
     clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
     clear_accordions = [gr.skip() for _ in range(8)]  # Don't change accordion visibility
+    dump_audio = [gr.update(value="", subtitles="") for _ in range(8)]
     yield (
         # Audio outputs - just skip, value will be updated in loop
         # Subtitles will be cleared via lrc_display.change()
         # Call generate_with_progress with the saved parameters
         # Note: generate_with_progress is a generator, need to iterate through it
+        # For AutoGen background batches, always skip metas COT since we want to
+        # generate NEW audio codes with new seeds, not regenerate the same metas
         generator = generate_with_progress(
             dit_handler,
             llm_handler,
             use_cot_metas=params.get("use_cot_metas"),
             use_cot_caption=params.get("use_cot_caption"),
             use_cot_language=params.get("use_cot_language"),
+            is_format_caption=is_format_caption,  # Pass through - will skip metas COT if True
             constrained_decoding_debug=params.get("constrained_decoding_debug"),
             allow_lm_batch=params.get("allow_lm_batch"),
             auto_score=params.get("auto_score"),

acestep/gradio_ui/i18n/en.json CHANGED Viewed

@@ -84,7 +84,7 @@
     "mode_simple": "Simple",
     "mode_custom": "Custom",
     "simple_query_label": "Song Description",
-    "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'",
     "simple_query_info": "Enter a natural language description of the music you want to generate",
     "simple_vocal_language_label": "Vocal Language (optional)",
     "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
@@ -98,6 +98,7 @@
     "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
     "lyrics_info": "Song lyrics with structure",
     "instrumental_label": "Instrumental",
     "optional_params": "⚙️ Optional Parameters",
     "vocal_language_label": "Vocal Language (optional)",
     "vocal_language_info": "use `unknown` for inst",
@@ -227,6 +228,9 @@
     "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
     "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
     "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
-    "simple_example_loaded": "🎲 Loaded random example from {filename}"
   }
 }

     "mode_simple": "Simple",
     "mode_custom": "Custom",
     "simple_query_label": "Song Description",
+    "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'. Leave empty for a random sample.",
     "simple_query_info": "Enter a natural language description of the music you want to generate",
     "simple_vocal_language_label": "Vocal Language (optional)",
     "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
     "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
     "lyrics_info": "Song lyrics with structure",
     "instrumental_label": "Instrumental",
+    "format_btn": "Format",
     "optional_params": "⚙️ Optional Parameters",
     "vocal_language_label": "Vocal Language (optional)",
     "vocal_language_info": "use `unknown` for inst",
     "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
     "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
     "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
+    "simple_example_loaded": "🎲 Loaded random example from {filename}",
+    "format_success": "✅ Caption and lyrics formatted successfully",
+    "format_failed": "❌ Format failed: {error}",
+    "skipping_metas_cot": "⚡ Skipping Phase 1 metas COT (sample already formatted)"
   }
 }

acestep/gradio_ui/i18n/ja.json CHANGED Viewed

@@ -84,7 +84,7 @@
     "mode_simple": "シンプル",
     "mode_custom": "カスタム",
     "simple_query_label": "曲の説明",
-    "simple_query_placeholder": "作成したい音楽を説明してください。例：'静かな夜のための優しいベンガルのラブソング'",
     "simple_query_info": "生成したい音楽の自然言語の説明を入力",
     "simple_vocal_language_label": "ボーカル言語(オプション)",
     "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
@@ -98,6 +98,7 @@
     "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
     "lyrics_info": "構造を持つ曲の歌詞",
     "instrumental_label": "インストゥルメンタル",
     "optional_params": "⚙️ オプションパラメータ",
     "vocal_language_label": "ボーカル言語(オプション)",
     "vocal_language_info": "インストには`unknown`を使用",
@@ -227,6 +228,9 @@
     "sample_created": "✅ サンプルが作成されました！キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
     "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
     "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
-    "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました"
   }
 }

     "mode_simple": "シンプル",
     "mode_custom": "カスタム",
     "simple_query_label": "曲の説明",
+    "simple_query_placeholder": "作成したい音楽を説明してください。例：'静かな夜のための優しいベンガルのラブソング'。空欄の場合はランダムなサンプルが生成されます。",
     "simple_query_info": "生成したい音楽の自然言語の説明を入力",
     "simple_vocal_language_label": "ボーカル言語(オプション)",
     "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
     "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
     "lyrics_info": "構造を持つ曲の歌詞",
     "instrumental_label": "インストゥルメンタル",
+    "format_btn": "フォーマット",
     "optional_params": "⚙️ オプションパラメータ",
     "vocal_language_label": "ボーカル言語(オプション)",
     "vocal_language_info": "インストには`unknown`を使用",
     "sample_created": "✅ サンプルが作成されました！キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
     "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
     "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
+    "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました",
+    "format_success": "✅ キャプションと歌詞のフォーマットに成功しました",
+    "format_failed": "❌ フォーマットに失敗しました: {error}",
+    "skipping_metas_cot": "⚡ Phase 1 メタデータ COT をスキップ（サンプルは既にフォーマット済み）"
   }
 }

acestep/gradio_ui/i18n/zh.json CHANGED Viewed

@@ -84,7 +84,7 @@
     "mode_simple": "简单",
     "mode_custom": "自定义",
     "simple_query_label": "歌曲描述",
-    "simple_query_placeholder": "描述你想创作的音乐，例如：'给我生成一首暗黑的戏剧古风，歌词要华丽'",
     "simple_query_info": "输入你想生成的音乐的自然语言描述",
     "simple_vocal_language_label": "人声语言(可选)",
     "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
@@ -98,6 +98,7 @@
     "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
     "lyrics_info": "带有结构的歌曲歌词",
     "instrumental_label": "纯音乐",
     "optional_params": "⚙️ 可选参数",
     "vocal_language_label": "人声语言(可选)",
     "vocal_language_info": "纯音乐使用 `unknown`",
@@ -227,6 +228,9 @@
     "sample_created": "✅ 样本已创建！检查描述和歌词，然后点击生成音乐。",
     "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
     "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
-    "simple_example_loaded": "🎲 已从 {filename} 加载随机示例"
   }
 }

     "mode_simple": "简单",
     "mode_custom": "自定义",
     "simple_query_label": "歌曲描述",
+    "simple_query_placeholder": "描述你想创作的音乐，例如：'给我生成一首暗黑的戏剧古风，歌词要华丽'。留空则随机生成样本。",
     "simple_query_info": "输入你想生成的音乐的自然语言描述",
     "simple_vocal_language_label": "人声语言(可选)",
     "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
     "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
     "lyrics_info": "带有结构的歌曲歌词",
     "instrumental_label": "纯音乐",
+    "format_btn": "格式化",
     "optional_params": "⚙️ 可选参数",
     "vocal_language_label": "人声语言(可选)",
     "vocal_language_info": "纯音乐使用 `unknown`",
     "sample_created": "✅ 样本已创建！检查描述和歌词，然后点击生成音乐。",
     "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
     "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
+    "simple_example_loaded": "🎲 已从 {filename} 加载随机示例",
+    "format_success": "✅ 描述和歌词格式化成功",
+    "format_failed": "❌ 格式化失败: {error}",
+    "skipping_metas_cot": "⚡ 跳过 Phase 1 元数据 COT（样本已格式化）"
   }
 }

acestep/gradio_ui/interfaces/generation.py CHANGED Viewed

@@ -314,15 +314,15 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                             placeholder=t("generation.caption_placeholder"),
                             lines=3,
                             info=t("generation.caption_info"),
-                            scale=9,
                         )
-                        sample_btn = gr.Button(
-                            "🎲",
-                            variant="secondary",
-                            size="sm",
-                            scale=1,
-                        )
                 # Lyrics - wrapped in accordion that can be collapsed in Simple mode
                 with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
                     lyrics = gr.Textbox(
@@ -331,22 +331,40 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                         lines=8,
                         info=t("generation.lyrics_info")
                     )
-                    instrumental_checkbox = gr.Checkbox(
-                        label=t("generation.instrumental_label"),
-                        value=False,
-                        scale=1,
-                    )
-                # Optional Parameters
-                with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
-                    with gr.Row():
                         vocal_language = gr.Dropdown(
                             choices=VALID_LANGUAGES,
                             value="unknown",
                             label=t("generation.vocal_language_label"),
                             allow_custom_value=True,
-                            info=t("generation.vocal_language_info")
                         )
                         bpm = gr.Number(
                             label=t("generation.bpm_label"),
                             value=None,
@@ -679,6 +697,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
         "autogen_checkbox": autogen_checkbox,
         "generate_btn": generate_btn,
         "instrumental_checkbox": instrumental_checkbox,
         "constrained_decoding_debug": constrained_decoding_debug,
         "score_scale": score_scale,
         "allow_lm_batch": allow_lm_batch,

                             placeholder=t("generation.caption_placeholder"),
                             lines=3,
                             info=t("generation.caption_info"),
+                            scale=12,
                         )
+                        with gr.Column(scale=1, min_width=100):
+                            sample_btn = gr.Button(
+                                "🎲",
+                                variant="secondary",
+                                size="sm",
+                                scale=2,
+                            )
                 # Lyrics - wrapped in accordion that can be collapsed in Simple mode
                 with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
                     lyrics = gr.Textbox(
                         lines=8,
                         info=t("generation.lyrics_info")
                     )
+                    with gr.Row(variant="compact", equal_height=True):
+                        instrumental_checkbox = gr.Checkbox(
+                            label=t("generation.instrumental_label"),
+                            value=False,
+                            scale=1,
+                            min_width=120,
+                            container=True,
+                        )
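+                        # Middle: vocal language selector (Dropdown)
+                        # No gr.HTML hack: the Dropdown keeps its own label parameter (hidden here via show_label=False) and Gradio handles the alignment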
                         vocal_language = gr.Dropdown(
                             choices=VALID_LANGUAGES,
                             value="unknown",
                             label=t("generation.vocal_language_label"),
+                            show_label=False,
+                            container=True,
                             allow_custom_value=True,
+                            scale=3,
+                        )
+                        # Right: format button (Button)
+                        # Placed at the far right of the same row so it is easy to reach
+                        format_btn = gr.Button(
+                            t("generation.format_btn"),
+                            variant="secondary",
+                            scale=1,
+                            min_width=80,
                         )
+                # Optional Parameters
+                with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
+                    with gr.Row():
                         bpm = gr.Number(
                             label=t("generation.bpm_label"),
                             value=None,
         "autogen_checkbox": autogen_checkbox,
         "generate_btn": generate_btn,
         "instrumental_checkbox": instrumental_checkbox,
+        "format_btn": format_btn,
         "constrained_decoding_debug": constrained_decoding_debug,
         "score_scale": score_scale,
         "allow_lm_batch": allow_lm_batch,

acestep/inference.py CHANGED Viewed

@@ -671,8 +671,6 @@ def understand_music(
     llm_handler,
     audio_codes: str,
     temperature: float = 0.85,
-    cfg_scale: float = 1.0,
-    negative_prompt: str = "NO USER INPUT",
     top_k: Optional[int] = None,
     top_p: Optional[float] = None,
     repetition_penalty: float = 1.0,
@@ -687,13 +685,13 @@ def understand_music(
     If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
     instead of analyzing existing codes.
     Args:
         llm_handler: Initialized LLM handler (LLMHandler instance)
         audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
                      Use empty string or "NO USER INPUT" to generate a sample example.
         temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
-        cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
-        negative_prompt: Negative prompt for CFG guidance
         top_k: Top-K sampling (None or 0 = disabled)
         top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
         repetition_penalty: Repetition penalty (1.0 = no penalty)
@@ -727,8 +725,6 @@ def understand_music(
         metadata, status = llm_handler.understand_audio_from_codes(
             audio_codes=audio_codes,
             temperature=temperature,
-            cfg_scale=cfg_scale,
-            negative_prompt=negative_prompt,
             top_k=top_k,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
@@ -847,7 +843,7 @@ def create_sample(
     llm_handler,
     query: str,
     instrumental: bool = False,
-    vocal_language: Optional[List[str]] = None,
     temperature: float = 0.85,
     top_k: Optional[int] = None,
     top_p: Optional[float] = None,
@@ -869,9 +865,9 @@ def create_sample(
         llm_handler: Initialized LLM handler (LLMHandler instance)
         query: User's natural language music description (e.g., "a soft Bengali love song")
         instrumental: Whether to generate instrumental music (no vocals)
-        vocal_language: List of allowed vocal languages for constrained decoding (e.g., ["en", "zh"]).
-                       If provided, the model will be constrained to generate lyrics in these languages.
-                       If None or ["unknown"], no language constraint is applied.
         temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
         top_k: Top-K sampling (None or 0 = disabled)
         top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
@@ -883,7 +879,7 @@ def create_sample(
         CreateSampleResult with generated sample fields and status
     Example:
-        >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language=["bn"])
         >>> if result.success:
         ...     print(f"Caption: {result.caption}")
         ...     print(f"Lyrics: {result.lyrics}")
@@ -897,14 +893,6 @@ def create_sample(
             error="LLM not initialized",
         )
-    # Validate query
-    if not query or not query.strip():
-        return CreateSampleResult(
-            status_message="No query provided. Please enter a music description.",
-            success=False,
-            error="Empty query",
-        )
     try:
         # Call LLM to create sample
         metadata, status = llm_handler.create_sample_from_query(
@@ -982,3 +970,175 @@ def create_sample(
             success=False,
             error=str(e),
         )

     llm_handler,
     audio_codes: str,
     temperature: float = 0.85,
     top_k: Optional[int] = None,
     top_p: Optional[float] = None,
     repetition_penalty: float = 1.0,
     If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
     instead of analyzing existing codes.
+    Note: cfg_scale and negative_prompt are not supported in understand mode.
     Args:
         llm_handler: Initialized LLM handler (LLMHandler instance)
         audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
                      Use empty string or "NO USER INPUT" to generate a sample example.
         temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
         top_k: Top-K sampling (None or 0 = disabled)
         top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
         repetition_penalty: Repetition penalty (1.0 = no penalty)
         metadata, status = llm_handler.understand_audio_from_codes(
             audio_codes=audio_codes,
             temperature=temperature,
             top_k=top_k,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
     llm_handler,
     query: str,
     instrumental: bool = False,
+    vocal_language: Optional[str] = None,
     temperature: float = 0.85,
     top_k: Optional[int] = None,
     top_p: Optional[float] = None,
         llm_handler: Initialized LLM handler (LLMHandler instance)
         query: User's natural language music description (e.g., "a soft Bengali love song")
         instrumental: Whether to generate instrumental music (no vocals)
+        vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
+                       If provided, the model will be constrained to generate lyrics in this language.
+                       If None or "unknown", no language constraint is applied.
         temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
         top_k: Top-K sampling (None or 0 = disabled)
         top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
         CreateSampleResult with generated sample fields and status
     Example:
+        >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language="bn")
         >>> if result.success:
         ...     print(f"Caption: {result.caption}")
         ...     print(f"Lyrics: {result.lyrics}")
             error="LLM not initialized",
         )
     try:
         # Call LLM to create sample
         metadata, status = llm_handler.create_sample_from_query(
             success=False,
             error=str(e),
         )
+@dataclass
+class FormatSampleResult:
+    """Outcome of the "Format" feature for a user-provided caption and lyrics.
+
+    The LLM turns the user's caption and lyrics into structured music metadata
+    and an enhanced description; ``format_sample`` packs the parsed values into
+    this container. Every field has a default, so a failure result can be built
+    from the status fields alone.
+
+    Attributes:
+        # Metadata Fields
+        caption: Enhanced/formatted music description/caption
+        lyrics: Formatted lyrics (may be same as input or reformatted)
+        bpm: Beats per minute (None if not detected)
+        duration: Duration in seconds (None if not detected)
+        keyscale: Musical key (e.g., "C Major"); empty string when unavailable
+        language: Vocal language code (e.g., "en", "zh"); empty string when unavailable
+        timesignature: Time signature (e.g., "4"); empty string when unavailable
+        # Status
+        status_message: Human-readable status message from formatting
+        success: Whether formatting completed successfully
+        error: Error message if formatting failed, otherwise None
+    """
+    # Metadata Fields
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert result to a plain dictionary (via ``dataclasses.asdict``) for JSON serialization."""
+        return asdict(self)
+def format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    user_metadata: Optional[Dict[str, Any]] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> FormatSampleResult:
+    """Format user-provided caption and lyrics using the 5Hz Language Model.
+
+    This function takes user input (caption and lyrics) and generates structured
+    music metadata including an enhanced caption, BPM, duration, key, language,
+    and time signature.
+
+    If user_metadata is provided, it is forwarded to the LLM handler so those
+    values can constrain the decoding.
+
+    Note: cfg_scale and negative_prompt are not supported in format mode.
+
+    Args:
+        llm_handler: Initialized LLM handler (LLMHandler instance)
+        caption: User's caption/description (e.g., "Latin pop, reggaeton")
+        lyrics: User's lyrics with structure tags
+        user_metadata: Optional dict with user-provided metadata to constrain decoding.
+                      Supported keys: bpm, duration, keyscale, timesignature, language
+        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
+        top_k: Top-K sampling (None or 0 = disabled)
+        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
+        repetition_penalty: Repetition penalty (1.0 = no penalty)
+        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
+        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
+
+    Returns:
+        FormatSampleResult with formatted metadata fields and status. Failures
+        (uninitialized LLM, empty metadata, or any exception) are reported through
+        ``success=False`` and ``error`` instead of being raised.
+
+    Example:
+        >>> result = format_sample(llm_handler, "Latin pop, reggaeton", "[Verse 1]\\nHola mundo...")
+        >>> if result.success:
+        ...     print(f"Caption: {result.caption}")
+        ...     print(f"BPM: {result.bpm}")
+        ...     print(f"Lyrics: {result.lyrics}")
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        return FormatSampleResult(
+            status_message="5Hz LM not initialized. Please initialize it first.",
+            success=False,
+            error="LLM not initialized",
+        )
+    try:
+        # Call LLM formatting
+        metadata, status = llm_handler.format_sample_from_input(
+            caption=caption,
+            lyrics=lyrics,
+            user_metadata=user_metadata,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            use_constrained_decoding=use_constrained_decoding,
+            constrained_decoding_debug=constrained_decoding_debug,
+        )
+        # Check if LLM returned empty metadata (error case)
+        if not metadata:
+            return FormatSampleResult(
+                status_message=status or "Failed to format input",
+                success=False,
+                error=status or "Empty metadata returned",
+            )
+        # BPM is parsed through float first so numeric strings such as "120.0"
+        # are kept (truncated to int) instead of being silently discarded.
+        bpm_number = _format_sample_number(metadata.get('bpm'))
+        # Prefer 'language'; fall back to 'vocal_language' when it is missing,
+        # empty, or only an 'N/A' placeholder.
+        language = (
+            _format_sample_text(metadata.get('language'))
+            or _format_sample_text(metadata.get('vocal_language'))
+        )
+        return FormatSampleResult(
+            caption=metadata.get('caption') or '',
+            lyrics=metadata.get('lyrics') or lyrics,  # Fall back to input lyrics
+            bpm=None if bpm_number is None else int(bpm_number),
+            duration=_format_sample_number(metadata.get('duration')),
+            keyscale=_format_sample_text(metadata.get('keyscale')),
+            language=language,
+            timesignature=_format_sample_text(metadata.get('timesignature')),
+            status_message=status,
+            success=True,
+            error=None,
+        )
+    except Exception as e:
+        # Top-level boundary for the UI: log the traceback and report the failure
+        # through the result object rather than raising.
+        logger.exception("Format sample failed")
+        return FormatSampleResult(
+            status_message=f"Error: {str(e)}",
+            success=False,
+            error=str(e),
+        )
+
+
+def _format_sample_number(value: Any) -> Optional[float]:
+    """Return ``value`` as a finite float, or None if it is missing, a placeholder, or not numeric."""
+    import math  # local import: keeps this helper independent of the file's import block
+
+    try:
+        number = float(value)
+    except (ValueError, TypeError, OverflowError):
+        # None, '', 'N/A', other non-numeric text, or an int too large for float
+        return None
+    # nan/inf are not usable BPM or duration values
+    return number if math.isfinite(number) else None
+
+
+def _format_sample_text(value: Any) -> Any:
+    """Map None and blank or 'N/A' strings to ''; strip other strings; pass non-strings through unchanged."""
+    if value is None:
+        return ''
+    if isinstance(value, str):
+        text = value.strip()
+        return '' if text.upper() == 'N/A' else text
+    return value

acestep/llm_inference.py CHANGED Viewed

@@ -19,7 +19,7 @@ from transformers.generation.logits_process import (
     RepetitionPenaltyLogitsProcessor,
 )
 from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
-from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION
 class LLMHandler:
@@ -1296,8 +1296,6 @@ class LLMHandler:
         self,
         audio_codes: str,
         temperature: float = 0.3,
-        cfg_scale: float = 1.0,
-        negative_prompt: str = "NO USER INPUT",
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
         repetition_penalty: float = 1.0,
@@ -1306,16 +1304,16 @@ class LLMHandler:
     ) -> Tuple[Dict[str, Any], str]:
         """
         Understand audio codes and generate metadata + lyrics.
         This is the reverse of the normal generation flow:
         - Input: Audio codes
         - Output: Metadata (bpm, caption, duration, etc.) + Lyrics
         Args:
             audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
             temperature: Sampling temperature for generation
-            cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
-            negative_prompt: Negative prompt for CFG
             top_k: Top-K sampling (None = disabled)
             top_p: Top-P (nucleus) sampling (None = disabled)
             repetition_penalty: Repetition penalty (1.0 = no penalty)
@@ -1352,12 +1350,11 @@ class LLMHandler:
         print(f"formatted_prompt: {formatted_prompt}")
         # Generate using constrained decoding (understand phase)
         # We want to generate metadata first (CoT), then lyrics (natural text)
         output_text, status = self.generate_from_formatted_prompt(
             formatted_prompt=formatted_prompt,
             cfg={
                 "temperature": temperature,
-                "cfg_scale": cfg_scale,
-                "negative_prompt": negative_prompt,
                 "top_k": top_k,
                 "top_p": top_p,
                 "repetition_penalty": repetition_penalty,
@@ -1491,7 +1488,7 @@ class LLMHandler:
         self,
         query: str,
         instrumental: bool = False,
-        vocal_language: Optional[List[str]] = None,
         temperature: float = 0.85,
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
@@ -1509,8 +1506,8 @@ class LLMHandler:
         Args:
             query: User's natural language music description
             instrumental: Whether to generate instrumental music (no vocals)
-            vocal_language: List of allowed vocal languages for constrained decoding (e.g., ["en", "zh"]).
-                           If provided and not ["unknown"], the first language will be used.
             temperature: Sampling temperature for generation (0.0-2.0)
             top_k: Top-K sampling (None = disabled)
             top_p: Top-P (nucleus) sampling (None = disabled)
@@ -1532,7 +1529,7 @@ class LLMHandler:
         Example:
             query = "a soft Bengali love song for a quiet evening"
-            metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language=["bn"])
             print(metadata['caption'])  # "A gentle romantic acoustic pop ballad..."
             print(metadata['lyrics'])   # "[Intro: ...]\\n..."
         """
@@ -1540,7 +1537,7 @@ class LLMHandler:
             return {}, "❌ 5Hz LM not initialized. Please initialize it first."
         if not query or not query.strip():
-            return {}, "❌ No query provided. Please enter a music description."
         logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
@@ -1554,14 +1551,11 @@ class LLMHandler:
         # Build user_metadata if vocal_language is specified and is not "unknown"
         user_metadata = None
         skip_language = False
-        if vocal_language and len(vocal_language) > 0:
-            # Filter out "unknown" from the list
-            valid_languages = [lang for lang in vocal_language if lang and lang.lower() != "unknown"]
-            if valid_languages:
-                # Use the first valid language for constrained decoding
-                user_metadata = {"language": valid_languages[0]}
-                skip_language = True  # Skip language generation since we're injecting it
-                logger.info(f"Using user-specified language: {valid_languages[0]}")
         # Generate using constrained decoding (inspiration phase)
         # Similar to understand mode - generate metadata first (CoT), then lyrics
@@ -1612,6 +1606,204 @@ class LLMHandler:
         status_msg = f"✅ Sample created successfully\nGenerated fields: {', '.join(metadata.keys())}"
         return metadata, status_msg
     def generate_from_formatted_prompt(
         self,
         formatted_prompt: str,

     RepetitionPenaltyLogitsProcessor,
 )
 from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
+from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION, DEFAULT_LM_REWRITE_INSTRUCTION
 class LLMHandler:
         self,
         audio_codes: str,
         temperature: float = 0.3,
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
         repetition_penalty: float = 1.0,
     ) -> Tuple[Dict[str, Any], str]:
         """
         Understand audio codes and generate metadata + lyrics.
         This is the reverse of the normal generation flow:
         - Input: Audio codes
         - Output: Metadata (bpm, caption, duration, etc.) + Lyrics
+        Note: cfg_scale and negative_prompt are not supported in understand mode.
         Args:
             audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
             temperature: Sampling temperature for generation
             top_k: Top-K sampling (None = disabled)
             top_p: Top-P (nucleus) sampling (None = disabled)
             repetition_penalty: Repetition penalty (1.0 = no penalty)
         print(f"formatted_prompt: {formatted_prompt}")
         # Generate using constrained decoding (understand phase)
         # We want to generate metadata first (CoT), then lyrics (natural text)
+        # Note: cfg_scale and negative_prompt are not used in understand mode
         output_text, status = self.generate_from_formatted_prompt(
             formatted_prompt=formatted_prompt,
             cfg={
                 "temperature": temperature,
                 "top_k": top_k,
                 "top_p": top_p,
                 "repetition_penalty": repetition_penalty,
         self,
         query: str,
         instrumental: bool = False,
+        vocal_language: Optional[str] = None,
         temperature: float = 0.85,
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
         Args:
             query: User's natural language music description
             instrumental: Whether to generate instrumental music (no vocals)
+            vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
+                           If provided and not "unknown", it will be used.
             temperature: Sampling temperature for generation (0.0-2.0)
             top_k: Top-K sampling (None = disabled)
             top_p: Top-P (nucleus) sampling (None = disabled)
         Example:
             query = "a soft Bengali love song for a quiet evening"
+            metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language="bn")
             print(metadata['caption'])  # "A gentle romantic acoustic pop ballad..."
             print(metadata['lyrics'])   # "[Intro: ...]\\n..."
         """
             return {}, "❌ 5Hz LM not initialized. Please initialize it first."
         if not query or not query.strip():
+            query = "NO USER INPUT"
         logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
         # Build user_metadata if vocal_language is specified and is not "unknown"
         user_metadata = None
         skip_language = False
+        if vocal_language and vocal_language.strip() and vocal_language.strip().lower() != "unknown":
+            # Use the specified language for constrained decoding
+            user_metadata = {"language": vocal_language.strip()}
+            skip_language = True  # Skip language generation since we're injecting it
+            logger.info(f"Using user-specified language: {vocal_language.strip()}")
         # Generate using constrained decoding (inspiration phase)
         # Similar to understand mode - generate metadata first (CoT), then lyrics
         status_msg = f"✅ Sample created successfully\nGenerated fields: {', '.join(metadata.keys())}"
         return metadata, status_msg
+    def build_formatted_prompt_for_format(
+        self,
+        caption: str,
+        lyrics: str,
+        is_negative_prompt: bool = False,
+        negative_prompt: str = "NO USER INPUT"
+    ) -> str:
+        """
+        Render the chat-template prompt used by format/rewrite mode.
+
+        The system turn carries the rewrite instruction; the user turn carries either
+        the caption and lyrics to be rewritten, or the negative prompt when the
+        unconditional (CFG) variant is requested.
+
+        Args:
+            caption: User's caption/description of the music
+            lyrics: User's lyrics
+            is_negative_prompt: If True, build the unconditional prompt for CFG
+            negative_prompt: User-turn text for the unconditional prompt; a blank
+                value yields an empty user turn
+
+        Returns:
+            Prompt string produced by the tokenizer's chat template (not tokenized,
+            with the generation prompt appended)
+
+        Raises:
+            ValueError: If the LLM tokenizer has not been initialized
+
+        Example:
+            caption = "Latin pop, reggaeton, flamenco-pop"
+            lyrics = "[Verse 1]\\nTengo un nudo..."
+            prompt = handler.build_formatted_prompt_for_format(caption, lyrics)
+        """
+        tokenizer = self.llm_tokenizer
+        if tokenizer is None:
+            raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")
+        if not is_negative_prompt:
+            # Conditional prompt: caption followed by lyrics
+            user_content = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}"
+        elif negative_prompt and negative_prompt.strip():
+            # Unconditional prompt for CFG
+            user_content = negative_prompt
+        else:
+            user_content = ""
+        system_message = {
+            "role": "system",
+            "content": f"# Instruction\n{DEFAULT_LM_REWRITE_INSTRUCTION}\n\n"
+        }
+        user_message = {"role": "user", "content": user_content}
+        return tokenizer.apply_chat_template(
+            [system_message, user_message],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+    def format_sample_from_input(
+        self,
+        caption: str,
+        lyrics: str,
+        user_metadata: Optional[Dict[str, Any]] = None,
+        temperature: float = 0.85,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        repetition_penalty: float = 1.0,
+        use_constrained_decoding: bool = True,
+        constrained_decoding_debug: bool = False,
+    ) -> Tuple[Dict[str, Any], str]:
+        """
+        Format user-provided caption and lyrics into structured music metadata.
+
+        This is the "Format" feature that takes user input and generates:
+        - Enhanced caption with detailed music description
+        - Metadata (bpm, duration, keyscale, language, timesignature)
+        - Formatted lyrics (the input lyrics are kept when none are generated)
+
+        A blank caption is replaced by "NO USER INPUT" and blank lyrics by
+        "[Instrumental]" before the prompt is built.
+
+        Note: cfg_scale and negative_prompt are not supported in format mode.
+
+        Args:
+            caption: User's caption/description (e.g., "Latin pop, reggaeton")
+            lyrics: User's lyrics with structure tags
+            user_metadata: Optional dict with user-provided metadata to constrain decoding.
+                          Supported keys: bpm, duration, keyscale, timesignature, language.
+                          A language of "unknown" is treated as "no constraint", matching
+                          create_sample_from_query.
+            temperature: Sampling temperature for generation (0.0-2.0)
+            top_k: Top-K sampling (None = disabled)
+            top_p: Top-P (nucleus) sampling (None = disabled)
+            repetition_penalty: Repetition penalty (1.0 = no penalty)
+            use_constrained_decoding: Whether to use FSM-based constrained decoding
+            constrained_decoding_debug: Whether to enable debug logging
+
+        Returns:
+            Tuple of (metadata_dict, status_message). metadata_dict is empty when the
+            LM is not initialized or generation produced no text; otherwise it holds
+            the fields parsed from the LM output plus:
+                - lyrics: str (generated lyrics, or the input lyrics as fallback)
+
+        Example:
+            caption = "Latin pop, reggaeton, flamenco-pop"
+            lyrics = "[Verse 1]\\nTengo un nudo en la garganta..."
+            metadata, status = handler.format_sample_from_input(caption, lyrics)
+            print(metadata['caption'])  # "A dramatic and powerful Latin pop track..."
+            print(metadata['bpm'])      # 100
+        """
+        if not getattr(self, "llm_initialized", False):
+            return {}, "❌ 5Hz LM not initialized. Please initialize it first."
+        if not caption or not caption.strip():
+            caption = "NO USER INPUT"
+        if not lyrics or not lyrics.strip():
+            lyrics = "[Instrumental]"
+        logger.info(f"Formatting sample from input: caption={caption[:50]}..., lyrics length={len(lyrics)}")
+        # Build formatted prompt for format task
+        formatted_prompt = self.build_formatted_prompt_for_format(
+            caption=caption,
+            lyrics=lyrics,
+        )
+        logger.debug(f"Formatted prompt for format: {formatted_prompt}")
+        # Build constrained decoding metadata from user_metadata (None when nothing usable)
+        constrained_metadata = self._build_format_constraints(user_metadata)
+        if constrained_metadata:
+            logger.info(f"Using user-provided metadata constraints: {constrained_metadata}")
+        # Language generation is skipped only when a concrete language is injected
+        skip_language = bool(constrained_metadata) and constrained_metadata.get('language') is not None
+        # Generate using constrained decoding (format phase)
+        # Similar to understand/inspiration mode - generate metadata first (CoT), then formatted lyrics
+        # Note: cfg_scale and negative_prompt are not used in format mode
+        output_text, status = self.generate_from_formatted_prompt(
+            formatted_prompt=formatted_prompt,
+            cfg={
+                "temperature": temperature,
+                "top_k": top_k,
+                "top_p": top_p,
+                "repetition_penalty": repetition_penalty,
+                "target_duration": None,  # No duration constraint for generation length
+                "user_metadata": constrained_metadata,  # Inject user-provided metadata
+                "skip_caption": False,  # Generate caption
+                "skip_language": skip_language,
+                "skip_genres": False,  # Generate genres
+                "generation_phase": "understand",  # Use understand phase for metadata + free-form lyrics
+                "caption": "",
+                "lyrics": "",
+            },
+            use_constrained_decoding=use_constrained_decoding,
+            constrained_decoding_debug=constrained_decoding_debug,
+            stop_at_reasoning=False,  # Continue after </think> to get formatted lyrics
+        )
+        if not output_text:
+            return {}, status
+        # Parse metadata and extract lyrics
+        metadata, _ = self.parse_lm_output(output_text)
+        # Extract formatted lyrics section (everything after </think>);
+        # if no lyrics were generated, keep the (normalized) input lyrics
+        formatted_lyrics = self._extract_lyrics_from_output(output_text)
+        metadata['lyrics'] = formatted_lyrics if formatted_lyrics else lyrics
+        logger.info(f"Format completed successfully. Generated {len(metadata)} fields")
+        if constrained_decoding_debug:
+            logger.debug(f"Generated metadata: {list(metadata.keys())}")
+            logger.debug(f"Output text preview: {output_text[:300]}...")
+        status_msg = f"✅ Format completed successfully\nGenerated fields: {', '.join(metadata.keys())}"
+        return metadata, status_msg
+
+    @staticmethod
+    def _build_format_constraints(user_metadata: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+        """Reduce user_metadata to the usable decoding constraints, or None if there are none."""
+        if not user_metadata:
+            return None
+        constraints: Dict[str, Any] = {}
+        # bpm / duration: keep positive whole numbers only. Going through float()
+        # accepts values like 120.0 or "120.0"; nan/inf and junk are ignored.
+        for key in ('bpm', 'duration'):
+            try:
+                number = int(float(user_metadata.get(key)))
+            except (ValueError, TypeError, OverflowError):
+                continue
+            if number > 0:
+                constraints[key] = number
+        # Free-text fields: drop empty/blank values and the "unknown" language
+        # placeholder, which means "no constraint" elsewhere in this handler.
+        for key in ('keyscale', 'timesignature', 'language'):
+            value = user_metadata.get(key)
+            if isinstance(value, str):
+                value = value.strip()
+            if not value:
+                continue
+            if key == 'language' and isinstance(value, str) and value.lower() == 'unknown':
+                continue
+            constraints[key] = value
+        return constraints or None
     def generate_from_formatted_prompt(
         self,
         formatted_prompt: str,

examples/simple_mode/example_01.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "a soft Bengali love song for a quiet evening",
     "instrumental": false,
-    "vocal_language": ["bn"]
 }

 {
     "description": "a soft Bengali love song for a quiet evening",
     "instrumental": false,
+    "vocal_language": "bn"
 }

examples/simple_mode/example_02.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "an upbeat summer pop song with catchy hooks",
     "instrumental": false,
-    "vocal_language": ["en"]
 }

 {
     "description": "an upbeat summer pop song with catchy hooks",
     "instrumental": false,
+    "vocal_language": "en"
 }

examples/simple_mode/example_03.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "epic orchestral cinematic music for a movie trailer",
     "instrumental": true,
-    "vocal_language": ["unknown"]
 }

 {
     "description": "epic orchestral cinematic music for a movie trailer",
     "instrumental": true,
+    "vocal_language": "unknown"
 }

examples/simple_mode/example_04.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "一首深情的中文抒情歌曲，适合夜晚独自聆听",
     "instrumental": false,
-    "vocal_language": ["zh"]
 }

 {
     "description": "一首深情的中文抒情歌曲，适合夜晚独自聆听",
     "instrumental": false,
+    "vocal_language": "zh"
 }

examples/simple_mode/example_05.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "Japanese city pop with nostalgic 80s vibes",
     "instrumental": false,
-    "vocal_language": ["ja"]
 }

 {
     "description": "Japanese city pop with nostalgic 80s vibes",
     "instrumental": false,
+    "vocal_language": "ja"
 }

examples/simple_mode/example_06.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "lo-fi hip hop beats for studying and relaxing",
     "instrumental": true,
-    "vocal_language": ["unknown"]
 }

 {
     "description": "lo-fi hip hop beats for studying and relaxing",
     "instrumental": true,
+    "vocal_language": "unknown"
 }

examples/simple_mode/example_07.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "energetic K-pop dance track with powerful vocals",
     "instrumental": false,
-    "vocal_language": ["ko"]
 }

 {
     "description": "energetic K-pop dance track with powerful vocals",
     "instrumental": false,
+    "vocal_language": "ko"
 }

examples/simple_mode/example_08.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "romantic Spanish guitar ballad with heartfelt lyrics",
     "instrumental": false,
-    "vocal_language": ["es"]
 }

 {
     "description": "romantic Spanish guitar ballad with heartfelt lyrics",
     "instrumental": false,
+    "vocal_language": "es"
 }

examples/simple_mode/example_09.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "中国风电子舞曲，融合古典乐器与现代节拍",
     "instrumental": false,
-    "vocal_language": ["zh"]
 }

 {
     "description": "中国风电子舞曲，融合古典乐器与现代节拍",
     "instrumental": false,
+    "vocal_language": "zh"
 }

examples/simple_mode/example_10.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "peaceful piano melody for meditation and relaxation",
     "instrumental": true,
-    "vocal_language": ["unknown"]
 }

 {
     "description": "peaceful piano melody for meditation and relaxation",
     "instrumental": true,
+    "vocal_language": "unknown"
 }