Spaces:

ACE-Step
/

Ace-Step-v1.5

Running on A100

App Files Files Community

Gong Junmin committed on Jan 14

Commit

2d3816e

unverified ·

2 Parent(s): d984ea0 4a86c5f

Merge pull request #6 from ace-step/add_simple_mode

Browse files

Files changed (18) hide show

acestep/gradio_ui/events/__init__.py +69 -2
acestep/gradio_ui/events/generation_handlers.py +267 -1
acestep/gradio_ui/i18n/en.json +17 -2
acestep/gradio_ui/i18n/ja.json +17 -2
acestep/gradio_ui/i18n/zh.json +17 -2
acestep/gradio_ui/interfaces/generation.py +74 -6
acestep/inference.py +185 -0
acestep/llm_inference.py +181 -2
examples/simple_mode/example_01.json +5 -0
examples/simple_mode/example_02.json +5 -0
examples/simple_mode/example_03.json +5 -0
examples/simple_mode/example_04.json +5 -0
examples/simple_mode/example_05.json +5 -0
examples/simple_mode/example_06.json +5 -0
examples/simple_mode/example_07.json +5 -0
examples/simple_mode/example_08.json +5 -0
examples/simple_mode/example_09.json +5 -0
examples/simple_mode/example_10.json +5 -0

acestep/gradio_ui/events/__init__.py CHANGED Viewed

@@ -121,11 +121,11 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
         )
     # ========== Sample/Transcribe Handlers ==========
     generation_section["sample_btn"].click(
-        fn=lambda task, debug: gen_h.sample_example_smart(llm_handler, task, debug) + (True,),
         inputs=[
             generation_section["task_type"],
-            generation_section["constrained_decoding_debug"]
         ],
         outputs=[
             generation_section["captions"],
@@ -190,6 +190,73 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
         outputs=[generation_section["lyrics"]]
     )
     # ========== Load/Save Metadata ==========
     generation_section["load_file"].upload(
         fn=gen_h.load_metadata,

         )
     # ========== Sample/Transcribe Handlers ==========
+    # Load random example from ./examples/text2music directory
     generation_section["sample_btn"].click(
+        fn=lambda task: gen_h.load_random_example(task) + (True,),
         inputs=[
             generation_section["task_type"],
         ],
         outputs=[
             generation_section["captions"],
         outputs=[generation_section["lyrics"]]
     )
+    # ========== Simple/Custom Mode Toggle ==========
+    generation_section["generation_mode"].change(
+        fn=gen_h.handle_generation_mode_change,
+        inputs=[generation_section["generation_mode"]],
+        outputs=[
+            generation_section["simple_mode_group"],
+            generation_section["caption_accordion"],
+            generation_section["lyrics_accordion"],
+            generation_section["generate_btn"],
+            generation_section["simple_sample_created"],
+            generation_section["optional_params_accordion"],
+        ]
+    )
+    # ========== Simple Mode Instrumental Checkbox ==========
+    # When instrumental is checked, disable vocal language and set to ["unknown"]
+    generation_section["simple_instrumental_checkbox"].change(
+        fn=gen_h.handle_simple_instrumental_change,
+        inputs=[generation_section["simple_instrumental_checkbox"]],
+        outputs=[generation_section["simple_vocal_language"]]
+    )
+    # ========== Random Description Button ==========
+    generation_section["random_desc_btn"].click(
+        fn=gen_h.load_random_simple_description,
+        inputs=[],
+        outputs=[
+            generation_section["simple_query_input"],
+            generation_section["simple_instrumental_checkbox"],
+            generation_section["simple_vocal_language"],
+        ]
+    )
+    # ========== Create Sample Button (Simple Mode) ==========
+    # Note: cfg_scale and negative_prompt are not supported in create_sample mode
+    generation_section["create_sample_btn"].click(
+        fn=lambda query, instrumental, vocal_lang, temp, top_k, top_p, debug: gen_h.handle_create_sample(
+            llm_handler, query, instrumental, vocal_lang, temp, top_k, top_p, debug
+        ),
+        inputs=[
+            generation_section["simple_query_input"],
+            generation_section["simple_instrumental_checkbox"],
+            generation_section["simple_vocal_language"],
+            generation_section["lm_temperature"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["constrained_decoding_debug"],
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            generation_section["instrumental_checkbox"],
+            generation_section["caption_accordion"],
+            generation_section["lyrics_accordion"],
+            generation_section["generate_btn"],
+            generation_section["simple_sample_created"],
+            generation_section["think_checkbox"],
+            results_section["is_format_caption_state"],
+            results_section["status_output"],
+        ]
+    )
     # ========== Load/Save Metadata ==========
     generation_section["load_file"].upload(
         fn=gen_h.load_metadata,

acestep/gradio_ui/events/generation_handlers.py CHANGED Viewed

@@ -13,7 +13,7 @@ from acestep.constants import (
     TASK_TYPES_BASE,
 )
 from acestep.gradio_ui.i18n import t
-from acestep.inference import understand_music
 def load_metadata(file_obj):
@@ -254,6 +254,65 @@ def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug
         return load_random_example(task_type)
 def refresh_checkpoints(dit_handler):
     """Refresh available checkpoints"""
     choices = dit_handler.get_available_checkpoints()
@@ -502,6 +561,24 @@ def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
             return current_lyrics
 def update_audio_components_visibility(batch_size):
     """Show/hide individual audio components based on batch size (1-8)
@@ -532,3 +609,192 @@ def update_audio_components_visibility(batch_size):
     return updates_row1 + updates_row2

     TASK_TYPES_BASE,
 )
 from acestep.gradio_ui.i18n import t
+from acestep.inference import understand_music, create_sample
 def load_metadata(file_obj):
         return load_random_example(task_type)
+def load_random_simple_description():
+    """Load a random description from the simple_mode examples directory.
+    One ``*.json`` file is picked at random from ``<project_root>/examples/simple_mode``
+    and its ``description``, ``instrumental`` and ``vocal_language`` keys are read
+    (missing keys default to ``''``, ``False`` and ``['unknown']`` respectively).
+    Returns:
+        Tuple of (description, instrumental, vocal_language) for updating UI components.
+        A string vocal_language is wrapped in a single-element list.
+        If the directory is missing or contains no JSON files, or the selected file
+        cannot be read or parsed, a gr.Warning is issued and three no-op gr.update()
+        values are returned instead, leaving the UI components unchanged.
+    """
+    try:
+        # Get the project root directory
+        current_file = os.path.abspath(__file__)
+        # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
+        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
+        # Construct the examples directory path
+        examples_dir = os.path.join(project_root, "examples", "simple_mode")
+        # Check if directory exists
+        if not os.path.exists(examples_dir):
+            gr.Warning(t("messages.simple_examples_not_found"))
+            return gr.update(), gr.update(), gr.update()
+        # Find all JSON files in the directory
+        json_files = glob.glob(os.path.join(examples_dir, "*.json"))
+        if not json_files:
+            gr.Warning(t("messages.simple_examples_empty"))
+            return gr.update(), gr.update(), gr.update()
+        # Randomly select one file
+        selected_file = random.choice(json_files)
+        # Read and parse JSON
+        try:
+            with open(selected_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            # Extract fields
+            description = data.get('description', '')
+            instrumental = data.get('instrumental', False)
+            vocal_language = data.get('vocal_language', ['unknown'])
+            # Ensure vocal_language is a list
+            if isinstance(vocal_language, str):
+                vocal_language = [vocal_language]
+            gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
+            return description, instrumental, vocal_language
+        except json.JSONDecodeError as e:
+            # Malformed JSON: report which file failed so it can be fixed
+            gr.Warning(t("messages.example_failed", filename=os.path.basename(selected_file), error=str(e)))
+            return gr.update(), gr.update(), gr.update()
+        except Exception as e:
+            # Any other failure while reading the selected file (e.g. I/O error)
+            gr.Warning(t("messages.example_error", error=str(e)))
+            return gr.update(), gr.update(), gr.update()
+    except Exception as e:
+        # Catch-all for failures outside the file read (path resolution, globbing, selection)
+        gr.Warning(t("messages.example_error", error=str(e)))
+        return gr.update(), gr.update(), gr.update()
 def refresh_checkpoints(dit_handler):
     """Refresh available checkpoints"""
     choices = dit_handler.get_available_checkpoints()
             return current_lyrics
+def handle_simple_instrumental_change(is_instrumental: bool):
+    """
+    Handle simple mode instrumental checkbox changes.
+    When checked: set vocal_language to ["unknown"] and disable editing.
+    When unchecked: enable vocal_language editing; the current value is left untouched.
+    Args:
+        is_instrumental: Whether instrumental checkbox is checked
+    Returns:
+        gr.update for simple_vocal_language dropdown
+    """
+    if is_instrumental:
+        return gr.update(value=["unknown"], interactive=False)
+    else:
+        # Only the interactive flag changes here; no value is passed, so the selection is kept
+        return gr.update(interactive=True)
 def update_audio_components_visibility(batch_size):
     """Show/hide individual audio components based on batch size (1-8)
     return updates_row1 + updates_row2
+def handle_generation_mode_change(mode: str):
+    """
+    Handle generation mode change between Simple and Custom modes.
+    In Simple mode:
+    - Show simple mode group (query input, instrumental checkbox, create button)
+    - Collapse caption and lyrics accordions
+    - Collapse optional parameters accordion
+    - Disable generate button until sample is created
+    In Custom mode:
+    - Hide simple mode group
+    - Expand caption and lyrics accordions
+    - Expand optional parameters accordion
+    - Enable generate button
+    Any mode value other than "simple" is treated as Custom mode.
+    Args:
+        mode: "simple" or "custom"
+    Returns:
+        Tuple of updates for:
+        - simple_mode_group (visibility)
+        - caption_accordion (open state)
+        - lyrics_accordion (open state)
+        - generate_btn (interactive state)
+        - simple_sample_created (reset state)
+        - optional_params_accordion (open state)
+    """
+    is_simple = mode == "simple"
+    return (
+        gr.update(visible=is_simple),  # simple_mode_group
+        gr.update(open=not is_simple),  # caption_accordion - collapsed in simple, open in custom
+        gr.update(open=not is_simple),  # lyrics_accordion - collapsed in simple, open in custom
+        gr.update(interactive=not is_simple),  # generate_btn - disabled in simple until sample created
+        False,  # simple_sample_created - reset to False on mode change
+        gr.update(open=not is_simple),  # optional_params_accordion - collapsed in simple, open in custom
+    )
+def handle_create_sample(
+    llm_handler,
+    query: str,
+    instrumental: bool,
+    vocal_language: list,
+    lm_temperature: float,
+    lm_top_k: int,
+    lm_top_p: float,
+    constrained_decoding_debug: bool = False,
+):
+    """
+    Handle the Create Sample button click in Simple mode.
+    Creates a sample from the user's query using the LLM, then populates
+    the caption, lyrics, and metadata fields.
+    Note: cfg_scale and negative_prompt are not supported in create_sample mode.
+    Args:
+        llm_handler: LLM handler instance
+        query: User's natural language music description
+        instrumental: Whether to generate instrumental music
+        vocal_language: List of preferred vocal languages for constrained decoding
+        lm_temperature: LLM temperature for generation
+        lm_top_k: LLM top-k sampling
+        lm_top_p: LLM top-p sampling
+        constrained_decoding_debug: Whether to enable debug logging
+    Returns:
+        Tuple of updates for (values in parentheses apply to the success path):
+        - captions
+        - lyrics
+        - bpm
+        - audio_duration
+        - key_scale
+        - vocal_language
+        - time_signature
+        - instrumental_checkbox
+        - caption_accordion (open)
+        - lyrics_accordion (open)
+        - generate_btn (interactive)
+        - simple_sample_created (True)
+        - think_checkbox (True)
+        - is_format_caption_state (True)
+        - status_output
+        On failure (blank query, LLM not initialized, or create_sample reporting
+        success=False) every field is left unchanged except: generate_btn is set
+        non-interactive, simple_sample_created is False, and status_output carries
+        the warning/error message.
+    """
+    # Validate query
+    if not query or not query.strip():
+        gr.Warning(t("messages.empty_query"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # instrumental_checkbox - no change
+            gr.update(),  # caption_accordion - no change
+            gr.update(),  # lyrics_accordion - no change
+            gr.update(interactive=False),  # generate_btn - keep disabled
+            False,  # simple_sample_created - still False
+            gr.update(),  # think_checkbox - no change
+            gr.update(),  # is_format_caption_state - no change
+            t("messages.empty_query"),  # status_output
+        )
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        gr.Warning(t("messages.lm_not_initialized"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # instrumental_checkbox - no change
+            gr.update(),  # caption_accordion - no change
+            gr.update(),  # lyrics_accordion - no change
+            gr.update(interactive=False),  # generate_btn - keep disabled
+            False,  # simple_sample_created - still False
+            gr.update(),  # think_checkbox - no change
+            gr.update(),  # is_format_caption_state - no change
+            t("messages.lm_not_initialized"),  # status_output
+        )
+    # Convert LM parameters
+    # A falsy/zero top_k and a falsy or >= 1.0 top_p are passed on as None
+    # (presumably meaning "filter disabled" to create_sample - TODO confirm)
+    top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
+    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
+    # Call create_sample API
+    # Note: cfg_scale and negative_prompt are not supported in create_sample mode
+    result = create_sample(
+        llm_handler=llm_handler,
+        query=query,
+        instrumental=instrumental,
+        vocal_language=vocal_language,
+        temperature=lm_temperature,
+        top_k=top_k_value,
+        top_p=top_p_value,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error
+    if not result.success:
+        # Prefer the message reported by create_sample; fall back to the generic i18n text
+        gr.Warning(result.status_message or t("messages.sample_creation_failed"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # instrumental_checkbox - no change
+            gr.update(),  # caption_accordion - no change
+            gr.update(),  # lyrics_accordion - no change
+            gr.update(interactive=False),  # generate_btn - keep disabled
+            False,  # simple_sample_created - still False
+            gr.update(),  # think_checkbox - no change
+            gr.update(),  # is_format_caption_state - no change
+            result.status_message or t("messages.sample_creation_failed"),  # status_output
+        )
+    # Success - populate fields
+    gr.Info(t("messages.sample_created"))
+    return (
+        result.caption,  # captions
+        result.lyrics,  # lyrics
+        result.bpm,  # bpm
+        # Missing or non-positive duration falls back to -1 (presumably the UI's "auto" sentinel - TODO confirm)
+        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
+        result.keyscale,  # key_scale
+        result.language,  # vocal_language
+        result.timesignature,  # time_signature
+        result.instrumental,  # instrumental_checkbox
+        gr.update(open=True),  # caption_accordion - expand
+        gr.update(open=True),  # lyrics_accordion - expand
+        gr.update(interactive=True),  # generate_btn - enable
+        True,  # simple_sample_created - True
+        True,  # think_checkbox - enable thinking
+        True,  # is_format_caption_state - True (LM-generated)
+        result.status_message,  # status_output
+    )

acestep/gradio_ui/i18n/en.json CHANGED Viewed

@@ -79,11 +79,20 @@
     "repainting_controls": "🎨 Repainting Controls (seconds)",
     "repainting_start": "Repainting Start",
     "repainting_end": "Repainting End",
     "caption_title": "📝 Music Caption",
     "caption_label": "Music Caption (optional)",
     "caption_placeholder": "A peaceful acoustic guitar melody with soft vocals...",
     "caption_info": "Describe the style, genre, instruments, and mood",
-    "sample_btn": "Sample",
     "lyrics_title": "📝 Lyrics",
     "lyrics_label": "Lyrics (optional)",
     "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
@@ -212,6 +221,12 @@
     "lrc_no_extra_outputs": "❌ No extra outputs found. Condition tensors not available.",
     "lrc_missing_tensors": "❌ Missing required tensors for LRC generation.",
     "lrc_sample_not_exist": "❌ Sample does not exist in current batch.",
-    "lrc_empty_result": "⚠️ LRC generation produced empty result."
   }
 }

     "repainting_controls": "🎨 Repainting Controls (seconds)",
     "repainting_start": "Repainting Start",
     "repainting_end": "Repainting End",
+    "mode_label": "Generation Mode",
+    "mode_info": "Simple: describe music in natural language. Custom: full control over caption and lyrics.",
+    "mode_simple": "Simple",
+    "mode_custom": "Custom",
+    "simple_query_label": "Song Description",
+    "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'",
+    "simple_query_info": "Enter a natural language description of the music you want to generate",
+    "simple_vocal_language_label": "Vocal Language (optional)",
+    "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
+    "create_sample_btn": "Create Sample",
     "caption_title": "📝 Music Caption",
     "caption_label": "Music Caption (optional)",
     "caption_placeholder": "A peaceful acoustic guitar melody with soft vocals...",
     "caption_info": "Describe the style, genre, instruments, and mood",
     "lyrics_title": "📝 Lyrics",
     "lyrics_label": "Lyrics (optional)",
     "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
     "lrc_no_extra_outputs": "❌ No extra outputs found. Condition tensors not available.",
     "lrc_missing_tensors": "❌ Missing required tensors for LRC generation.",
     "lrc_sample_not_exist": "❌ Sample does not exist in current batch.",
+    "lrc_empty_result": "⚠️ LRC generation produced empty result.",
+    "empty_query": "⚠️ Please enter a music description.",
+    "sample_creation_failed": "❌ Failed to create sample. Please try again.",
+    "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
+    "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
+    "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
+    "simple_example_loaded": "🎲 Loaded random example from {filename}"
   }
 }

acestep/gradio_ui/i18n/ja.json CHANGED Viewed

@@ -79,11 +79,20 @@
     "repainting_controls": "🎨 再描画コントロール(秒)",
     "repainting_start": "再描画開始",
     "repainting_end": "再描画終了",
     "caption_title": "📝 音楽キャプション",
     "caption_label": "音楽キャプション(オプション)",
     "caption_placeholder": "柔らかいボーカルを伴う穏やかなアコースティックギターのメロディー...",
     "caption_info": "スタイル、ジャンル、楽器、ムードを説明",
-    "sample_btn": "サンプル",
     "lyrics_title": "📝 歌詞",
     "lyrics_label": "歌詞(オプション)",
     "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
@@ -212,6 +221,12 @@
     "lrc_no_extra_outputs": "❌ 追加出力が見つかりません。条件テンソルが利用できません。",
     "lrc_missing_tensors": "❌ LRC生成に必要なテンソルがありません。",
     "lrc_sample_not_exist": "❌ 現在のバッチにサンプルが存在しません。",
-    "lrc_empty_result": "⚠️ LRC生成の結果が空です。"
   }
 }

     "repainting_controls": "🎨 再描画コントロール(秒)",
     "repainting_start": "再描画開始",
     "repainting_end": "再描画終了",
+    "mode_label": "生成モード",
+    "mode_info": "シンプル：自然言語で音楽を説明。カスタム：キャプションと歌詞を完全にコントロール。",
+    "mode_simple": "シンプル",
+    "mode_custom": "カスタム",
+    "simple_query_label": "曲の説明",
+    "simple_query_placeholder": "作成したい音楽を説明してください。例：'静かな夜のための優しいベンガルのラブソング'",
+    "simple_query_info": "生成したい音楽の自然言語の説明を入力",
+    "simple_vocal_language_label": "ボーカル言語(オプション)",
+    "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
+    "create_sample_btn": "サンプル作成",
     "caption_title": "📝 音楽キャプション",
     "caption_label": "音楽キャプション(オプション)",
     "caption_placeholder": "柔らかいボーカルを伴う穏やかなアコースティックギターのメロディー...",
     "caption_info": "スタイル、ジャンル、楽器、ムードを説明",
     "lyrics_title": "📝 歌詞",
     "lyrics_label": "歌詞(オプション)",
     "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
     "lrc_no_extra_outputs": "❌ 追加出力が見つかりません。条件テンソルが利用できません。",
     "lrc_missing_tensors": "❌ LRC生成に必要なテンソルがありません。",
     "lrc_sample_not_exist": "❌ 現在のバッチにサンプルが存在しません。",
+    "lrc_empty_result": "⚠️ LRC生成の結果が空です。",
+    "empty_query": "⚠️ 音楽の説明を入力してください。",
+    "sample_creation_failed": "❌ サンプルの作成に失敗しました。もう一度お試しください。",
+    "sample_created": "✅ サンプルが作成されました！キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
+    "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
+    "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
+    "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました"
   }
 }

acestep/gradio_ui/i18n/zh.json CHANGED Viewed

@@ -79,11 +79,20 @@
     "repainting_controls": "🎨 重绘控制(秒)",
     "repainting_start": "重绘开始",
     "repainting_end": "重绘结束",
     "caption_title": "📝 音乐描述",
     "caption_label": "音乐描述(可选)",
     "caption_placeholder": "一段平和的原声吉他旋律,配有柔和的人声...",
     "caption_info": "描述风格、流派、乐器和情绪",
-    "sample_btn": "示例",
     "lyrics_title": "📝 歌词",
     "lyrics_label": "歌词(可选)",
     "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
@@ -212,6 +221,12 @@
     "lrc_no_extra_outputs": "❌ 未找到额外输出。条件张量不可用。",
     "lrc_missing_tensors": "❌ 缺少LRC生成所需的张量。",
     "lrc_sample_not_exist": "❌ 当前批次中不存在该样本。",
-    "lrc_empty_result": "⚠️ LRC生成结果为空。"
   }
 }

     "repainting_controls": "🎨 重绘控制(秒)",
     "repainting_start": "重绘开始",
     "repainting_end": "重绘结束",
+    "mode_label": "生成模式",
+    "mode_info": "简单模式：用自然语言描述音乐。自定义模式：完全控制描述和歌词。",
+    "mode_simple": "简单",
+    "mode_custom": "自定义",
+    "simple_query_label": "歌曲描述",
+    "simple_query_placeholder": "描述你想创作的音乐，例如：'给我生成一首暗黑的戏剧古风，歌词要华丽'",
+    "simple_query_info": "输入你想生成的音乐的自然语言描述",
+    "simple_vocal_language_label": "人声语言(可选)",
+    "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
+    "create_sample_btn": "创建样本",
     "caption_title": "📝 音乐描述",
     "caption_label": "音乐描述(可选)",
     "caption_placeholder": "一段平和的原声吉他旋律,配有柔和的人声...",
     "caption_info": "描述风格、流派、乐器和情绪",
     "lyrics_title": "📝 歌词",
     "lyrics_label": "歌词(可选)",
     "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
     "lrc_no_extra_outputs": "❌ 未找到额外输出。条件张量不可用。",
     "lrc_missing_tensors": "❌ 缺少LRC生成所需的张量。",
     "lrc_sample_not_exist": "❌ 当前批次中不存在该样本。",
+    "lrc_empty_result": "⚠️ LRC生成结果为空。",
+    "empty_query": "⚠️ 请输入音乐描述。",
+    "sample_creation_failed": "❌ 创建样本失败。请重试。",
+    "sample_created": "✅ 样本已创建！检查描述和歌词，然后点击生成音乐。",
+    "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
+    "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
+    "simple_example_loaded": "🎲 已从 {filename} 加载随机示例"
   }
 }

acestep/gradio_ui/interfaces/generation.py CHANGED Viewed

@@ -250,9 +250,64 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                                 minimum=-1,
                                 step=0.1,
                             )
-                # Music Caption
-                with gr.Accordion(t("generation.caption_title"), open=True):
                     with gr.Row(equal_height=True):
                         captions = gr.Textbox(
                             label=t("generation.caption_label"),
@@ -262,14 +317,14 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                             scale=9,
                         )
                         sample_btn = gr.Button(
-                            t("generation.sample_btn"),
                             variant="secondary",
                             size="sm",
                             scale=1,
                         )
-                # Lyrics
-                with gr.Accordion(t("generation.lyrics_title"), open=True):
                     lyrics = gr.Textbox(
                         label=t("generation.lyrics_label"),
                         placeholder=t("generation.lyrics_placeholder"),
@@ -283,7 +338,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                     )
                 # Optional Parameters
-                with gr.Accordion(t("generation.optional_params"), open=True):
                     with gr.Row():
                         vocal_language = gr.Dropdown(
                             choices=VALID_LANGUAGES,
@@ -587,6 +642,19 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
         "repainting_start": repainting_start,
         "repainting_end": repainting_end,
         "audio_cover_strength": audio_cover_strength,
         "captions": captions,
         "sample_btn": sample_btn,
         "load_file": load_file,

                                 minimum=-1,
                                 step=0.1,
                             )
+                    # Simple/Custom Mode Toggle
+                    with gr.Row():
+                        generation_mode = gr.Radio(
+                            choices=[
+                                (t("generation.mode_simple"), "simple"),
+                                (t("generation.mode_custom"), "custom"),
+                            ],
+                            value="simple",
+                            label=t("generation.mode_label"),
+                            info=t("generation.mode_info"),
+                        )
+                    # Simple Mode Components - visible only in Simple mode
+                    with gr.Group(visible=True) as simple_mode_group:
+                        with gr.Row(equal_height=True):
+                            simple_query_input = gr.Textbox(
+                                label=t("generation.simple_query_label"),
+                                placeholder=t("generation.simple_query_placeholder"),
+                                lines=2,
+                                info=t("generation.simple_query_info"),
+                                scale=12,
+                            )
+                            with gr.Column(scale=1, min_width=100):
+                                random_desc_btn = gr.Button(
+                                    "🎲",
+                                    variant="secondary",
+                                    size="sm",
+                                    scale=2
+                                )
+                        with gr.Row(equal_height=True):
+                            with gr.Column(scale=1, variant="compact"):
+                                simple_instrumental_checkbox = gr.Checkbox(
+                                    label=t("generation.instrumental_label"),
+                                    value=False,
+                                )
+                            with gr.Column(scale=18):
+                                create_sample_btn = gr.Button(
+                                    t("generation.create_sample_btn"),
+                                    variant="primary",
+                                    size="lg",
+                                )
+                            with gr.Column(scale=1, variant="compact"):
+                                simple_vocal_language = gr.Dropdown(
+                                    choices=VALID_LANGUAGES,
+                                    value="unknown",
+                                    allow_custom_value=True,
+                                    label=t("generation.simple_vocal_language_label"),
+                                    interactive=True,
+                                )
+                    # State to track if sample has been created in Simple mode
+                    simple_sample_created = gr.State(value=False)
+                # Music Caption - wrapped in accordion that can be collapsed in Simple mode
+                with gr.Accordion(t("generation.caption_title"), open=False) as caption_accordion:
                     with gr.Row(equal_height=True):
                         captions = gr.Textbox(
                             label=t("generation.caption_label"),
                             scale=9,
                         )
                         sample_btn = gr.Button(
+                            "🎲",
                             variant="secondary",
                             size="sm",
                             scale=1,
                         )
+                # Lyrics - wrapped in accordion that can be collapsed in Simple mode
+                with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
                     lyrics = gr.Textbox(
                         label=t("generation.lyrics_label"),
                         placeholder=t("generation.lyrics_placeholder"),
                     )
                 # Optional Parameters
+                with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
                     with gr.Row():
                         vocal_language = gr.Dropdown(
                             choices=VALID_LANGUAGES,
         "repainting_start": repainting_start,
         "repainting_end": repainting_end,
         "audio_cover_strength": audio_cover_strength,
+        # Simple/Custom Mode Components
+        "generation_mode": generation_mode,
+        "simple_mode_group": simple_mode_group,
+        "simple_query_input": simple_query_input,
+        "random_desc_btn": random_desc_btn,
+        "simple_instrumental_checkbox": simple_instrumental_checkbox,
+        "simple_vocal_language": simple_vocal_language,
+        "create_sample_btn": create_sample_btn,
+        "simple_sample_created": simple_sample_created,
+        "caption_accordion": caption_accordion,
+        "lyrics_accordion": lyrics_accordion,
+        "optional_params_accordion": optional_params_accordion,
+        # Existing components
         "captions": captions,
         "sample_btn": sample_btn,
         "load_file": load_file,

acestep/inference.py CHANGED Viewed

@@ -797,3 +797,188 @@ def understand_music(
             success=False,
             error=str(e),
         )

             success=False,
             error=str(e),
         )
@dataclass
class CreateSampleResult:
    """Outcome of building a music sample from a natural language query.

    Returned by the "Simple Mode" / "Inspiration Mode" flow, where the user
    describes the music in plain language and the LLM produces a complete
    sample: caption, lyrics, and metadata.

    Attributes:
        caption: Generated detailed music description/caption.
        lyrics: Generated lyrics (or "[Instrumental]" for instrumental music).
        bpm: Beats per minute (None if not generated).
        duration: Duration in seconds (None if not generated).
        keyscale: Musical key (e.g., "C Major").
        language: Vocal language code (e.g., "en", "zh").
        timesignature: Time signature (e.g., "4").
        instrumental: Whether this is an instrumental piece.
        status_message: Status message from sample creation.
        success: Whether sample creation completed successfully.
        error: Error message if sample creation failed.
    """

    # --- Generated sample content and metadata ---
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""
    instrumental: bool = False

    # --- Outcome of the creation call ---
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dictionary for JSON serialization."""
        return asdict(self)
def _coerce_sample_int(value: Any) -> Optional[int]:
    """Best-effort conversion of an LLM metadata value to int; None if unusable."""
    if value is None or value == 'N/A' or value == '':
        return None
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        pass
    try:
        # int() rejects float-formatted strings such as "120.0"; go through float.
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        # Covers garbage text as well as non-finite values ("inf", "nan").
        return None


def _coerce_sample_float(value: Any) -> Optional[float]:
    """Best-effort conversion of an LLM metadata value to float; None if unusable."""
    if value is None or value == 'N/A' or value == '':
        return None
    try:
        return float(value)
    except (ValueError, TypeError, OverflowError):
        return None


def _blank_if_na(value: Any) -> Any:
    """Map the LLM's 'N/A' placeholder to an empty string; pass anything else through."""
    return '' if value == 'N/A' else value


def create_sample(
    llm_handler,
    query: str,
    instrumental: bool = False,
    vocal_language: Optional[List[str]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> CreateSampleResult:
    """Create a music sample from a natural language query using the 5Hz Language Model.

    This is the "Simple Mode" / "Inspiration Mode" entry point: it forwards the
    query to ``llm_handler.create_sample_from_query`` and normalizes the returned
    metadata dictionary into a ``CreateSampleResult``.

    Note: cfg_scale and negative_prompt are not supported in create_sample mode.

    Args:
        llm_handler: Initialized LLM handler; must expose ``llm_initialized`` and
            ``create_sample_from_query``.
        query: User's natural language music description
            (e.g., "a soft Bengali love song").
        instrumental: Whether to generate instrumental music (no vocals).
        vocal_language: Allowed vocal languages (e.g., ["en", "zh"]); passed
            through unchanged to the handler.
        temperature: Sampling temperature for generation.
        top_k: Top-K sampling; passed through to the handler.
        top_p: Top-P (nucleus) sampling; passed through to the handler.
        repetition_penalty: Repetition penalty (1.0 = no penalty).
        use_constrained_decoding: Whether to use constrained decoding.
        constrained_decoding_debug: Whether to enable debug logging.

    Returns:
        CreateSampleResult. On failure ``success`` is False and ``error`` holds
        the reason; this function does not raise for handler errors.

    Example:
        >>> result = create_sample(llm_handler, "a soft Bengali love song", vocal_language=["bn"])
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
        ...     print(f"BPM: {result.bpm}")
    """
    # Check if LLM is initialized
    if not llm_handler.llm_initialized:
        return CreateSampleResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    # Validate query
    if not query or not query.strip():
        return CreateSampleResult(
            status_message="No query provided. Please enter a music description.",
            success=False,
            error="Empty query",
        )

    try:
        metadata, status = llm_handler.create_sample_from_query(
            query=query,
            instrumental=instrumental,
            vocal_language=vocal_language,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # The handler signals failure with empty metadata plus a status text.
        if not metadata:
            return CreateSampleResult(
                status_message=status or "Failed to create sample",
                success=False,
                error=status or "Empty metadata returned",
            )

        # Unparseable numeric fields degrade to None instead of failing the
        # whole sample; 'N/A' placeholders become empty strings.
        return CreateSampleResult(
            caption=metadata.get('caption', ''),
            lyrics=metadata.get('lyrics', ''),
            bpm=_coerce_sample_int(metadata.get('bpm')),
            duration=_coerce_sample_float(metadata.get('duration')),
            keyscale=_blank_if_na(metadata.get('keyscale', '')),
            # Some outputs use 'vocal_language' instead of 'language'.
            language=_blank_if_na(
                metadata.get('language', metadata.get('vocal_language', ''))
            ),
            timesignature=_blank_if_na(metadata.get('timesignature', '')),
            instrumental=metadata.get('instrumental', instrumental),
            status_message=status,
            success=True,
            error=None,
        )
    except Exception as e:
        # Top-level boundary for UI/API callers: log the traceback and report
        # the failure through the result object rather than raising.
        logger.exception("Sample creation failed")
        return CreateSampleResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )

acestep/llm_inference.py CHANGED Viewed

@@ -19,7 +19,7 @@ from transformers.generation.logits_process import (
     RepetitionPenaltyLogitsProcessor,
 )
 from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
-from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION
 class LLMHandler:
@@ -308,7 +308,7 @@ class LLMHandler:
             if not os.path.exists(full_lm_model_path):
                 return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
-            logger.info("loading 5Hz LM tokenizer...")
             start_time = time.time()
             # TODO: tokenizer loading is too slow; no solution found yet
             llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
@@ -1433,6 +1433,185 @@ class LLMHandler:
         return after_think.strip()
     def generate_from_formatted_prompt(
         self,
         formatted_prompt: str,

     RepetitionPenaltyLogitsProcessor,
 )
 from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
+from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION
 class LLMHandler:
             if not os.path.exists(full_lm_model_path):
                 return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
+            logger.info("loading 5Hz LM tokenizer... it may take 80~90s")
             start_time = time.time()
             # TODO: tokenizer loading is too slow; no solution found yet
             llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
         return after_think.strip()
+    def build_formatted_prompt_for_inspiration(
+        self,
+        query: str,
+        instrumental: bool = False,
+        is_negative_prompt: bool = False,
+        negative_prompt: str = "NO USER INPUT"
+    ) -> str:
+        """
+        Build the chat-formatted prompt for inspiration/simple mode.
+        This generates a complete sample (caption, lyrics, metadata) from a user's
+        natural language music description query.
+        Args:
+            query: User's natural language music description
+            instrumental: Whether to generate instrumental music (no vocals)
+            is_negative_prompt: If True, builds unconditional prompt for CFG
+            negative_prompt: Negative prompt for CFG (used when is_negative_prompt=True)
+        Returns:
+            Formatted prompt string
+        Example:
+            query = "a soft Bengali love song for a quiet evening"
+            prompt = handler.build_formatted_prompt_for_inspiration(query, instrumental=False)
+        """
+        if self.llm_tokenizer is None:
+            raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")
+        # Build user content with query and instrumental flag
+        instrumental_str = "true" if instrumental else "false"
+        if is_negative_prompt:
+            # For CFG unconditional prompt
+            user_content = negative_prompt if negative_prompt and negative_prompt.strip() else ""
+        else:
+            # Normal prompt: query + instrumental flag
+            user_content = f"{query}\n\ninstrumental: {instrumental_str}"
+        return self.llm_tokenizer.apply_chat_template(
+            [
+                {
+                    "role": "system",
+                    "content": f"# Instruction\n{DEFAULT_LM_INSPIRED_INSTRUCTION}\n\n"
+                },
+                {
+                    "role": "user",
+                    "content": user_content
+                },
+            ],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+    def create_sample_from_query(
+        self,
+        query: str,
+        instrumental: bool = False,
+        vocal_language: Optional[List[str]] = None,
+        temperature: float = 0.85,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        repetition_penalty: float = 1.0,
+        use_constrained_decoding: bool = True,
+        constrained_decoding_debug: bool = False,
+    ) -> Tuple[Dict[str, Any], str]:
+        """
+        Create a complete music sample from a user's natural language query.
+        This is the "Simple Mode" / "Inspiration Mode" feature that generates:
+        - Metadata (bpm, caption, duration, keyscale, language, timesignature)
+        - Lyrics (unless instrumental=True)
+        Args:
+            query: User's natural language music description
+            instrumental: Whether to generate instrumental music (no vocals)
+            vocal_language: List of allowed vocal languages for constrained decoding (e.g., ["en", "zh"]).
+                           If provided and not ["unknown"], the first language will be used.
+            temperature: Sampling temperature for generation (0.0-2.0)
+            top_k: Top-K sampling (None = disabled)
+            top_p: Top-P (nucleus) sampling (None = disabled)
+            repetition_penalty: Repetition penalty (1.0 = no penalty)
+            use_constrained_decoding: Whether to use FSM-based constrained decoding
+            constrained_decoding_debug: Whether to enable debug logging
+        Returns:
+            Tuple of (metadata_dict, status_message)
+            metadata_dict contains:
+                - bpm: int or str
+                - caption: str
+                - duration: int or str
+                - keyscale: str
+                - language: str
+                - timesignature: str
+                - lyrics: str (extracted from output after </think>)
+                - instrumental: bool (echoed back)
+        Example:
+            query = "a soft Bengali love song for a quiet evening"
+            metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language=["bn"])
+            print(metadata['caption'])  # "A gentle romantic acoustic pop ballad..."
+            print(metadata['lyrics'])   # "[Intro: ...]\\n..."
+        """
+        if not getattr(self, "llm_initialized", False):
+            return {}, "❌ 5Hz LM not initialized. Please initialize it first."
+        if not query or not query.strip():
+            return {}, "❌ No query provided. Please enter a music description."
+        logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
+        # Build formatted prompt for inspiration
+        formatted_prompt = self.build_formatted_prompt_for_inspiration(
+            query=query,
+            instrumental=instrumental,
+        )
+        logger.debug(f"Formatted prompt for inspiration: {formatted_prompt}")
+        # Build user_metadata if vocal_language is specified and is not "unknown"
+        user_metadata = None
+        skip_language = False
+        if vocal_language and len(vocal_language) > 0:
+            # Filter out "unknown" from the list
+            valid_languages = [lang for lang in vocal_language if lang and lang.lower() != "unknown"]
+            if valid_languages:
+                # Use the first valid language for constrained decoding
+                user_metadata = {"language": valid_languages[0]}
+                skip_language = True  # Skip language generation since we're injecting it
+                logger.info(f"Using user-specified language: {valid_languages[0]}")
+        # Generate using constrained decoding (inspiration phase)
+        # Similar to understand mode - generate metadata first (CoT), then lyrics
+        # Note: cfg_scale and negative_prompt are not used in create_sample mode
+        output_text, status = self.generate_from_formatted_prompt(
+            formatted_prompt=formatted_prompt,
+            cfg={
+                "temperature": temperature,
+                "top_k": top_k,
+                "top_p": top_p,
+                "repetition_penalty": repetition_penalty,
+                "target_duration": None,  # No duration constraint
+                "user_metadata": user_metadata,  # Inject language if specified
+                "skip_caption": False,  # Generate caption
+                "skip_language": skip_language,  # Skip if we're injecting language
+                "skip_genres": False,  # Generate genres
+                "generation_phase": "understand",  # Use understand phase for metadata + free-form lyrics
+                "caption": "",
+                "lyrics": "",
+            },
+            use_constrained_decoding=use_constrained_decoding,
+            constrained_decoding_debug=constrained_decoding_debug,
+            stop_at_reasoning=False,  # Continue after </think> to generate lyrics
+        )
+        if not output_text:
+            return {}, status
+        # Parse metadata and extract lyrics
+        metadata, _ = self.parse_lm_output(output_text)
+        # Extract lyrics section (everything after </think>)
+        lyrics = self._extract_lyrics_from_output(output_text)
+        if lyrics:
+            metadata['lyrics'] = lyrics
+        elif instrumental:
+            # For instrumental, set empty lyrics or placeholder
+            metadata['lyrics'] = "[Instrumental]"
+        # Echo back the instrumental flag
+        metadata['instrumental'] = instrumental
+        logger.info(f"Sample created successfully. Generated {len(metadata)} fields")
+        if constrained_decoding_debug:
+            logger.debug(f"Generated metadata: {list(metadata.keys())}")
+            logger.debug(f"Output text preview: {output_text[:300]}...")
+        status_msg = f"✅ Sample created successfully\nGenerated fields: {', '.join(metadata.keys())}"
+        return metadata, status_msg
     def generate_from_formatted_prompt(
         self,
         formatted_prompt: str,

examples/simple_mode/example_01.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "a soft Bengali love song for a quiet evening",
+    "instrumental": false,
+    "vocal_language": ["bn"]
+}

examples/simple_mode/example_02.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "an upbeat summer pop song with catchy hooks",
+    "instrumental": false,
+    "vocal_language": ["en"]
+}

examples/simple_mode/example_03.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "epic orchestral cinematic music for a movie trailer",
+    "instrumental": true,
+    "vocal_language": ["unknown"]
+}

examples/simple_mode/example_04.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "一首深情的中文抒情歌曲，适合夜晚独自聆听",
+    "instrumental": false,
+    "vocal_language": ["zh"]
+}

examples/simple_mode/example_05.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "Japanese city pop with nostalgic 80s vibes",
+    "instrumental": false,
+    "vocal_language": ["ja"]
+}

examples/simple_mode/example_06.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "lo-fi hip hop beats for studying and relaxing",
+    "instrumental": true,
+    "vocal_language": ["unknown"]
+}

examples/simple_mode/example_07.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "energetic K-pop dance track with powerful vocals",
+    "instrumental": false,
+    "vocal_language": ["ko"]
+}

examples/simple_mode/example_08.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "romantic Spanish guitar ballad with heartfelt lyrics",
+    "instrumental": false,
+    "vocal_language": ["es"]
+}

examples/simple_mode/example_09.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "中国风电子舞曲，融合古典乐器与现代节拍",
+    "instrumental": false,
+    "vocal_language": ["zh"]
+}

examples/simple_mode/example_10.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "description": "peaceful piano melody for meditation and relaxation",
+    "instrumental": true,
+    "vocal_language": ["unknown"]
+}