Gong Junmin committed on
Commit
2d3816e
·
unverified ·
2 Parent(s): d984ea0 4a86c5f

Merge pull request #6 from ace-step/add_simple_mode

Browse files
acestep/gradio_ui/events/__init__.py CHANGED
@@ -121,11 +121,11 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
121
  )
122
 
123
  # ========== Sample/Transcribe Handlers ==========
 
124
  generation_section["sample_btn"].click(
125
- fn=lambda task, debug: gen_h.sample_example_smart(llm_handler, task, debug) + (True,),
126
  inputs=[
127
  generation_section["task_type"],
128
- generation_section["constrained_decoding_debug"]
129
  ],
130
  outputs=[
131
  generation_section["captions"],
@@ -190,6 +190,73 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
190
  outputs=[generation_section["lyrics"]]
191
  )
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  # ========== Load/Save Metadata ==========
194
  generation_section["load_file"].upload(
195
  fn=gen_h.load_metadata,
 
121
  )
122
 
123
  # ========== Sample/Transcribe Handlers ==========
124
+ # Load random example from ./examples/text2music directory
125
  generation_section["sample_btn"].click(
126
+ fn=lambda task: gen_h.load_random_example(task) + (True,),
127
  inputs=[
128
  generation_section["task_type"],
 
129
  ],
130
  outputs=[
131
  generation_section["captions"],
 
190
  outputs=[generation_section["lyrics"]]
191
  )
192
 
193
+ # ========== Simple/Custom Mode Toggle ==========
194
+ generation_section["generation_mode"].change(
195
+ fn=gen_h.handle_generation_mode_change,
196
+ inputs=[generation_section["generation_mode"]],
197
+ outputs=[
198
+ generation_section["simple_mode_group"],
199
+ generation_section["caption_accordion"],
200
+ generation_section["lyrics_accordion"],
201
+ generation_section["generate_btn"],
202
+ generation_section["simple_sample_created"],
203
+ generation_section["optional_params_accordion"],
204
+ ]
205
+ )
206
+
207
+ # ========== Simple Mode Instrumental Checkbox ==========
208
+ # When instrumental is checked, disable vocal language and set to ["unknown"]
209
+ generation_section["simple_instrumental_checkbox"].change(
210
+ fn=gen_h.handle_simple_instrumental_change,
211
+ inputs=[generation_section["simple_instrumental_checkbox"]],
212
+ outputs=[generation_section["simple_vocal_language"]]
213
+ )
214
+
215
+ # ========== Random Description Button ==========
216
+ generation_section["random_desc_btn"].click(
217
+ fn=gen_h.load_random_simple_description,
218
+ inputs=[],
219
+ outputs=[
220
+ generation_section["simple_query_input"],
221
+ generation_section["simple_instrumental_checkbox"],
222
+ generation_section["simple_vocal_language"],
223
+ ]
224
+ )
225
+
226
+ # ========== Create Sample Button (Simple Mode) ==========
227
+ # Note: cfg_scale and negative_prompt are not supported in create_sample mode
228
+ generation_section["create_sample_btn"].click(
229
+ fn=lambda query, instrumental, vocal_lang, temp, top_k, top_p, debug: gen_h.handle_create_sample(
230
+ llm_handler, query, instrumental, vocal_lang, temp, top_k, top_p, debug
231
+ ),
232
+ inputs=[
233
+ generation_section["simple_query_input"],
234
+ generation_section["simple_instrumental_checkbox"],
235
+ generation_section["simple_vocal_language"],
236
+ generation_section["lm_temperature"],
237
+ generation_section["lm_top_k"],
238
+ generation_section["lm_top_p"],
239
+ generation_section["constrained_decoding_debug"],
240
+ ],
241
+ outputs=[
242
+ generation_section["captions"],
243
+ generation_section["lyrics"],
244
+ generation_section["bpm"],
245
+ generation_section["audio_duration"],
246
+ generation_section["key_scale"],
247
+ generation_section["vocal_language"],
248
+ generation_section["time_signature"],
249
+ generation_section["instrumental_checkbox"],
250
+ generation_section["caption_accordion"],
251
+ generation_section["lyrics_accordion"],
252
+ generation_section["generate_btn"],
253
+ generation_section["simple_sample_created"],
254
+ generation_section["think_checkbox"],
255
+ results_section["is_format_caption_state"],
256
+ results_section["status_output"],
257
+ ]
258
+ )
259
+
260
  # ========== Load/Save Metadata ==========
261
  generation_section["load_file"].upload(
262
  fn=gen_h.load_metadata,
acestep/gradio_ui/events/generation_handlers.py CHANGED
@@ -13,7 +13,7 @@ from acestep.constants import (
13
  TASK_TYPES_BASE,
14
  )
15
  from acestep.gradio_ui.i18n import t
16
- from acestep.inference import understand_music
17
 
18
 
19
  def load_metadata(file_obj):
@@ -254,6 +254,65 @@ def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug
254
  return load_random_example(task_type)
255
 
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  def refresh_checkpoints(dit_handler):
258
  """Refresh available checkpoints"""
259
  choices = dit_handler.get_available_checkpoints()
@@ -502,6 +561,24 @@ def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
502
  return current_lyrics
503
 
504
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
505
  def update_audio_components_visibility(batch_size):
506
  """Show/hide individual audio components based on batch size (1-8)
507
 
@@ -532,3 +609,192 @@ def update_audio_components_visibility(batch_size):
532
  return updates_row1 + updates_row2
533
 
534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  TASK_TYPES_BASE,
14
  )
15
  from acestep.gradio_ui.i18n import t
16
+ from acestep.inference import understand_music, create_sample
17
 
18
 
19
  def load_metadata(file_obj):
 
254
  return load_random_example(task_type)
255
 
256
 
257
def load_random_simple_description():
    """Pick a random simple-mode example and return its UI field values.

    Searches ``<project_root>/examples/simple_mode`` for ``*.json`` files,
    chooses one at random and reads the ``description``, ``instrumental`` and
    ``vocal_language`` keys from it.

    Returns:
        Tuple of (description, instrumental, vocal_language) for updating the
        UI components. ``vocal_language`` is always a non-empty list. On any
        failure a warning is shown and three no-op ``gr.update()`` values are
        returned so the current UI contents are left untouched.
    """
    try:
        # This file lives in acestep/gradio_ui/events/, so stripping the file
        # name plus three package directories yields the project root.
        project_root = os.path.abspath(__file__)
        for _ in range(4):
            project_root = os.path.dirname(project_root)

        examples_dir = os.path.join(project_root, "examples", "simple_mode")

        # A regular file with this name is as useless as a missing directory.
        if not os.path.isdir(examples_dir):
            gr.Warning(t("messages.simple_examples_not_found"))
            return gr.update(), gr.update(), gr.update()

        json_files = glob.glob(os.path.join(examples_dir, "*.json"))
        if not json_files:
            gr.Warning(t("messages.simple_examples_empty"))
            return gr.update(), gr.update(), gr.update()

        selected_file = random.choice(json_files)
        filename = os.path.basename(selected_file)

        try:
            with open(selected_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            gr.Warning(t("messages.example_failed", filename=filename, error=str(e)))
            return gr.update(), gr.update(), gr.update()

        # The fields are read with .get(), so the JSON root must be an object.
        if not isinstance(data, dict):
            gr.Warning(t("messages.example_failed", filename=filename, error="expected a JSON object"))
            return gr.update(), gr.update(), gr.update()

        description = data.get('description', '')
        instrumental = data.get('instrumental', False)

        # A missing, null or empty entry falls back to ["unknown"]; a bare
        # string is wrapped so callers always receive a list.
        vocal_language = data.get('vocal_language') or ['unknown']
        if isinstance(vocal_language, str):
            vocal_language = [vocal_language]

        gr.Info(t("messages.simple_example_loaded", filename=filename))
        return description, instrumental, vocal_language

    except Exception as e:
        # UI boundary: report the problem instead of crashing the event handler.
        gr.Warning(t("messages.example_error", error=str(e)))
        return gr.update(), gr.update(), gr.update()
314
+
315
+
316
  def refresh_checkpoints(dit_handler):
317
  """Refresh available checkpoints"""
318
  choices = dit_handler.get_available_checkpoints()
 
561
  return current_lyrics
562
 
563
 
564
def handle_simple_instrumental_change(is_instrumental: bool):
    """
    React to the simple-mode instrumental checkbox being toggled.

    Args:
        is_instrumental: Whether the instrumental checkbox is checked

    Returns:
        gr.update for the simple_vocal_language dropdown: when checked, the
        value is forced to ["unknown"] and editing is disabled; when unchecked,
        editing is re-enabled and the current value is left alone.
    """
    # Vocals wanted: only unlock the dropdown, keep whatever is selected.
    if not is_instrumental:
        return gr.update(interactive=True)

    # Instrumental: reset the language and lock the dropdown.
    return gr.update(value=["unknown"], interactive=False)
580
+
581
+
582
  def update_audio_components_visibility(batch_size):
583
  """Show/hide individual audio components based on batch size (1-8)
584
 
 
609
  return updates_row1 + updates_row2
610
 
611
 
612
def handle_generation_mode_change(mode: str):
    """
    Handle generation mode change between Simple and Custom modes.

    In Simple mode:
    - Show simple mode group (query input, instrumental checkbox, create button)
    - Collapse caption, lyrics and optional parameters accordions
    - Disable generate button until sample is created

    In Custom mode:
    - Hide simple mode group
    - Expand caption, lyrics and optional parameters accordions
    - Enable generate button

    Args:
        mode: "simple" or "custom"; any value other than "simple" is treated
            as custom

    Returns:
        Tuple of updates for:
        - simple_mode_group (visibility)
        - caption_accordion (open state)
        - lyrics_accordion (open state)
        - generate_btn (interactive state)
        - simple_sample_created (reset state)
        - optional_params_accordion (open state; it stays visible in both modes)
    """
    is_simple = mode == "simple"
    # Everything that is collapsed or disabled in simple mode is expanded or
    # enabled in custom mode, so one flag drives all of them.
    expanded = not is_simple

    return (
        gr.update(visible=is_simple),  # simple_mode_group
        gr.update(open=expanded),  # caption_accordion - collapsed in simple, open in custom
        gr.update(open=expanded),  # lyrics_accordion - collapsed in simple, open in custom
        gr.update(interactive=expanded),  # generate_btn - disabled in simple until sample created
        False,  # simple_sample_created - reset to False on mode change
        gr.update(open=expanded),  # optional_params_accordion - collapsed in simple, open in custom
    )
650
+
651
+
652
def _create_sample_failure_updates(message):
    """Build the 15-output tuple returned when no sample could be created.

    Every field is left unchanged except that the generate button stays
    disabled, ``simple_sample_created`` is False and ``message`` is shown in
    the status output.
    """
    return (
        gr.update(),  # captions - no change
        gr.update(),  # lyrics - no change
        gr.update(),  # bpm - no change
        gr.update(),  # audio_duration - no change
        gr.update(),  # key_scale - no change
        gr.update(),  # vocal_language - no change
        gr.update(),  # time_signature - no change
        gr.update(),  # instrumental_checkbox - no change
        gr.update(),  # caption_accordion - no change
        gr.update(),  # lyrics_accordion - no change
        gr.update(interactive=False),  # generate_btn - keep disabled
        False,  # simple_sample_created - still False
        gr.update(),  # think_checkbox - no change
        gr.update(),  # is_format_caption_state - no change
        message,  # status_output
    )


def handle_create_sample(
    llm_handler,
    query: str,
    instrumental: bool,
    vocal_language: list,
    lm_temperature: float,
    lm_top_k: int,
    lm_top_p: float,
    constrained_decoding_debug: bool = False,
):
    """
    Handle the Create Sample button click in Simple mode.

    Creates a sample from the user's query using the LLM, then populates
    the caption, lyrics, and metadata fields.

    Note: cfg_scale and negative_prompt are not supported in create_sample mode.

    Args:
        llm_handler: LLM handler instance; its ``llm_initialized`` flag is checked
        query: User's natural language music description
        instrumental: Whether to generate instrumental music
        vocal_language: Preferred vocal languages for constrained decoding;
            a single string is accepted and wrapped into a one-element list
        lm_temperature: LLM temperature for generation
        lm_top_k: LLM top-k sampling (falsy/0 disables it)
        lm_top_p: LLM top-p sampling (falsy or >= 1.0 disables it)
        constrained_decoding_debug: Whether to enable debug logging

    Returns:
        Tuple of updates for:
        - captions
        - lyrics
        - bpm
        - audio_duration
        - key_scale
        - vocal_language
        - time_signature
        - instrumental_checkbox
        - caption_accordion (open)
        - lyrics_accordion (open)
        - generate_btn (interactive)
        - simple_sample_created (True)
        - think_checkbox (True)
        - is_format_caption_state (True)
        - status_output

        On failure (empty query, LLM not initialized, or an unsuccessful
        result) all fields are left unchanged, the generate button stays
        disabled and the status output carries the failure message.
    """
    # Validate query
    if not query or not query.strip():
        message = t("messages.empty_query")
        gr.Warning(message)
        return _create_sample_failure_updates(message)

    # Check if LLM is initialized
    if not llm_handler.llm_initialized:
        message = t("messages.lm_not_initialized")
        gr.Warning(message)
        return _create_sample_failure_updates(message)

    # Convert LM parameters: disabled sampling knobs are passed as None
    top_k_value = int(lm_top_k) if lm_top_k else None
    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p

    # A single-select dropdown delivers a bare string, while create_sample
    # expects a list of language codes.
    if isinstance(vocal_language, str):
        vocal_language = [vocal_language]

    # Call create_sample API
    # Note: cfg_scale and negative_prompt are not supported in create_sample mode
    result = create_sample(
        llm_handler=llm_handler,
        query=query,
        instrumental=instrumental,
        vocal_language=vocal_language,
        temperature=lm_temperature,
        top_k=top_k_value,
        top_p=top_p_value,
        use_constrained_decoding=True,
        constrained_decoding_debug=constrained_decoding_debug,
    )

    # Handle error
    if not result.success:
        message = result.status_message or t("messages.sample_creation_failed")
        gr.Warning(message)
        return _create_sample_failure_updates(message)

    # Success - populate fields
    gr.Info(t("messages.sample_created"))

    return (
        result.caption,  # captions
        result.lyrics,  # lyrics
        result.bpm,  # bpm
        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
        result.keyscale,  # key_scale
        result.language,  # vocal_language
        result.timesignature,  # time_signature
        result.instrumental,  # instrumental_checkbox
        gr.update(open=True),  # caption_accordion - expand
        gr.update(open=True),  # lyrics_accordion - expand
        gr.update(interactive=True),  # generate_btn - enable
        True,  # simple_sample_created - True
        True,  # think_checkbox - enable thinking
        True,  # is_format_caption_state - True (LM-generated)
        result.status_message,  # status_output
    )
799
+
800
+
acestep/gradio_ui/i18n/en.json CHANGED
@@ -79,11 +79,20 @@
79
  "repainting_controls": "🎨 Repainting Controls (seconds)",
80
  "repainting_start": "Repainting Start",
81
  "repainting_end": "Repainting End",
 
 
 
 
 
 
 
 
 
 
82
  "caption_title": "📝 Music Caption",
83
  "caption_label": "Music Caption (optional)",
84
  "caption_placeholder": "A peaceful acoustic guitar melody with soft vocals...",
85
  "caption_info": "Describe the style, genre, instruments, and mood",
86
- "sample_btn": "Sample",
87
  "lyrics_title": "📝 Lyrics",
88
  "lyrics_label": "Lyrics (optional)",
89
  "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
@@ -212,6 +221,12 @@
212
  "lrc_no_extra_outputs": "❌ No extra outputs found. Condition tensors not available.",
213
  "lrc_missing_tensors": "❌ Missing required tensors for LRC generation.",
214
  "lrc_sample_not_exist": "❌ Sample does not exist in current batch.",
215
- "lrc_empty_result": "⚠️ LRC generation produced empty result."
 
 
 
 
 
 
216
  }
217
  }
 
79
  "repainting_controls": "🎨 Repainting Controls (seconds)",
80
  "repainting_start": "Repainting Start",
81
  "repainting_end": "Repainting End",
82
+ "mode_label": "Generation Mode",
83
+ "mode_info": "Simple: describe music in natural language. Custom: full control over caption and lyrics.",
84
+ "mode_simple": "Simple",
85
+ "mode_custom": "Custom",
86
+ "simple_query_label": "Song Description",
87
+ "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'",
88
+ "simple_query_info": "Enter a natural language description of the music you want to generate",
89
+ "simple_vocal_language_label": "Vocal Language (optional)",
90
+ "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
91
+ "create_sample_btn": "Create Sample",
92
  "caption_title": "📝 Music Caption",
93
  "caption_label": "Music Caption (optional)",
94
  "caption_placeholder": "A peaceful acoustic guitar melody with soft vocals...",
95
  "caption_info": "Describe the style, genre, instruments, and mood",
 
96
  "lyrics_title": "📝 Lyrics",
97
  "lyrics_label": "Lyrics (optional)",
98
  "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
 
221
  "lrc_no_extra_outputs": "❌ No extra outputs found. Condition tensors not available.",
222
  "lrc_missing_tensors": "❌ Missing required tensors for LRC generation.",
223
  "lrc_sample_not_exist": "❌ Sample does not exist in current batch.",
224
+ "lrc_empty_result": "⚠️ LRC generation produced empty result.",
225
+ "empty_query": "⚠️ Please enter a music description.",
226
+ "sample_creation_failed": "❌ Failed to create sample. Please try again.",
227
+ "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
228
+ "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
229
+ "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
230
+ "simple_example_loaded": "🎲 Loaded random example from {filename}"
231
  }
232
  }
acestep/gradio_ui/i18n/ja.json CHANGED
@@ -79,11 +79,20 @@
79
  "repainting_controls": "🎨 再描画コントロール(秒)",
80
  "repainting_start": "再描画開始",
81
  "repainting_end": "再描画終了",
 
 
 
 
 
 
 
 
 
 
82
  "caption_title": "📝 音楽キャプション",
83
  "caption_label": "音楽キャプション(オプション)",
84
  "caption_placeholder": "柔らかいボーカルを伴う穏やかなアコースティックギターのメロディー...",
85
  "caption_info": "スタイル、ジャンル、楽器、ムードを説明",
86
- "sample_btn": "サンプル",
87
  "lyrics_title": "📝 歌詞",
88
  "lyrics_label": "歌詞(オプション)",
89
  "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
@@ -212,6 +221,12 @@
212
  "lrc_no_extra_outputs": "❌ 追加出力が見つかりません。条件テンソルが利用できません。",
213
  "lrc_missing_tensors": "❌ LRC生成に必要なテンソルがありません。",
214
  "lrc_sample_not_exist": "❌ 現在のバッチにサンプルが存在しません。",
215
- "lrc_empty_result": "⚠️ LRC生成の結果が空です。"
 
 
 
 
 
 
216
  }
217
  }
 
79
  "repainting_controls": "🎨 再描画コントロール(秒)",
80
  "repainting_start": "再描画開始",
81
  "repainting_end": "再描画終了",
82
+ "mode_label": "生成モード",
83
+ "mode_info": "シンプル:自然言語で音楽を説明。カスタム:キャプションと歌詞を完全にコントロール。",
84
+ "mode_simple": "シンプル",
85
+ "mode_custom": "カスタム",
86
+ "simple_query_label": "曲の説明",
87
+ "simple_query_placeholder": "作成したい音楽を説明してください。例:'静かな夜のための優しいベンガルのラブソング'",
88
+ "simple_query_info": "生成したい音楽の自然言語の説明を入力",
89
+ "simple_vocal_language_label": "ボーカル言語(オプション)",
90
+ "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
91
+ "create_sample_btn": "サンプル作成",
92
  "caption_title": "📝 音楽キャプション",
93
  "caption_label": "音楽キャプション(オプション)",
94
  "caption_placeholder": "柔らかいボーカルを伴う穏やかなアコースティックギターのメロディー...",
95
  "caption_info": "スタイル、ジャンル、楽器、ムードを説明",
 
96
  "lyrics_title": "📝 歌詞",
97
  "lyrics_label": "歌詞(オプション)",
98
  "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
 
221
  "lrc_no_extra_outputs": "❌ 追加出力が見つかりません。条件テンソルが利用できません。",
222
  "lrc_missing_tensors": "❌ LRC生成に必要なテンソルがありません。",
223
  "lrc_sample_not_exist": "❌ 現在のバッチにサンプルが存在しません。",
224
+ "lrc_empty_result": "⚠️ LRC生成の結果が空です。",
225
+ "empty_query": "⚠️ 音楽の説明を入力してください。",
226
+ "sample_creation_failed": "❌ サンプルの作成に失敗しました。もう一度お試しください。",
227
+ "sample_created": "✅ サンプルが作成されました!キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
228
+ "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
229
+ "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
230
+ "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました"
231
  }
232
  }
acestep/gradio_ui/i18n/zh.json CHANGED
@@ -79,11 +79,20 @@
79
  "repainting_controls": "🎨 重绘控制(秒)",
80
  "repainting_start": "重绘开始",
81
  "repainting_end": "重绘结束",
 
 
 
 
 
 
 
 
 
 
82
  "caption_title": "📝 音乐描述",
83
  "caption_label": "音乐描述(可选)",
84
  "caption_placeholder": "一段平和的原声吉他旋律,配有柔和的人声...",
85
  "caption_info": "描述风格、流派、乐器和情绪",
86
- "sample_btn": "示例",
87
  "lyrics_title": "📝 歌词",
88
  "lyrics_label": "歌词(可选)",
89
  "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
@@ -212,6 +221,12 @@
212
  "lrc_no_extra_outputs": "❌ 未找到额外输出。条件张量不可用。",
213
  "lrc_missing_tensors": "❌ 缺少LRC生成所需的张量。",
214
  "lrc_sample_not_exist": "❌ 当前批次中不存在该样本。",
215
- "lrc_empty_result": "⚠️ LRC生成结果为空。"
 
 
 
 
 
 
216
  }
217
  }
 
79
  "repainting_controls": "🎨 重绘控制(秒)",
80
  "repainting_start": "重绘开始",
81
  "repainting_end": "重绘结束",
82
+ "mode_label": "生成模式",
83
+ "mode_info": "简单模式:用自然语言描述音乐。自定义模式:完全控制描述和歌词。",
84
+ "mode_simple": "简单",
85
+ "mode_custom": "自定义",
86
+ "simple_query_label": "歌曲描述",
87
+ "simple_query_placeholder": "描述你想创作的音乐,例如:'给我生成一首暗黑的戏剧古风,歌词要华丽'",
88
+ "simple_query_info": "输入你想生成的音乐的自然语言描述",
89
+ "simple_vocal_language_label": "人声语言(可选)",
90
+ "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
91
+ "create_sample_btn": "创建样本",
92
  "caption_title": "📝 音乐描述",
93
  "caption_label": "音乐描述(可选)",
94
  "caption_placeholder": "一段平和的原声吉他旋律,配有柔和的人声...",
95
  "caption_info": "描述风格、流派、乐器和情绪",
 
96
  "lyrics_title": "📝 歌词",
97
  "lyrics_label": "歌词(可选)",
98
  "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
 
221
  "lrc_no_extra_outputs": "❌ 未找到额外输出。条件张量不可用。",
222
  "lrc_missing_tensors": "❌ 缺少LRC生成所需的张量。",
223
  "lrc_sample_not_exist": "❌ 当前批次中不存在该样本。",
224
+ "lrc_empty_result": "⚠️ LRC生成结果为空。",
225
+ "empty_query": "⚠️ 请输入音乐描述。",
226
+ "sample_creation_failed": "❌ 创建样本失败。请重试。",
227
+ "sample_created": "✅ 样本已创建!检查描述和歌词,然后点击生成音乐。",
228
+ "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
229
+ "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
230
+ "simple_example_loaded": "🎲 已从 {filename} 加载随机示例"
231
  }
232
  }
acestep/gradio_ui/interfaces/generation.py CHANGED
@@ -250,9 +250,64 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
250
  minimum=-1,
251
  step=0.1,
252
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
- # Music Caption
255
- with gr.Accordion(t("generation.caption_title"), open=True):
256
  with gr.Row(equal_height=True):
257
  captions = gr.Textbox(
258
  label=t("generation.caption_label"),
@@ -262,14 +317,14 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
262
  scale=9,
263
  )
264
  sample_btn = gr.Button(
265
- t("generation.sample_btn"),
266
  variant="secondary",
267
  size="sm",
268
  scale=1,
269
  )
270
 
271
- # Lyrics
272
- with gr.Accordion(t("generation.lyrics_title"), open=True):
273
  lyrics = gr.Textbox(
274
  label=t("generation.lyrics_label"),
275
  placeholder=t("generation.lyrics_placeholder"),
@@ -283,7 +338,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
283
  )
284
 
285
  # Optional Parameters
286
- with gr.Accordion(t("generation.optional_params"), open=True):
287
  with gr.Row():
288
  vocal_language = gr.Dropdown(
289
  choices=VALID_LANGUAGES,
@@ -587,6 +642,19 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
587
  "repainting_start": repainting_start,
588
  "repainting_end": repainting_end,
589
  "audio_cover_strength": audio_cover_strength,
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  "captions": captions,
591
  "sample_btn": sample_btn,
592
  "load_file": load_file,
 
250
  minimum=-1,
251
  step=0.1,
252
  )
253
+
254
+ # Simple/Custom Mode Toggle
255
+ with gr.Row():
256
+ generation_mode = gr.Radio(
257
+ choices=[
258
+ (t("generation.mode_simple"), "simple"),
259
+ (t("generation.mode_custom"), "custom"),
260
+ ],
261
+ value="simple",
262
+ label=t("generation.mode_label"),
263
+ info=t("generation.mode_info"),
264
+ )
265
+
266
+ # Simple Mode Components - visible only in Simple mode
267
+ with gr.Group(visible=True) as simple_mode_group:
268
+ with gr.Row(equal_height=True):
269
+ simple_query_input = gr.Textbox(
270
+ label=t("generation.simple_query_label"),
271
+ placeholder=t("generation.simple_query_placeholder"),
272
+ lines=2,
273
+ info=t("generation.simple_query_info"),
274
+ scale=12,
275
+ )
276
+
277
+ with gr.Column(scale=1, min_width=100):
278
+ random_desc_btn = gr.Button(
279
+ "🎲",
280
+ variant="secondary",
281
+ size="sm",
282
+ scale=2
283
+ )
284
+
285
+ with gr.Row(equal_height=True):
286
+ with gr.Column(scale=1, variant="compact"):
287
+ simple_instrumental_checkbox = gr.Checkbox(
288
+ label=t("generation.instrumental_label"),
289
+ value=False,
290
+ )
291
+ with gr.Column(scale=18):
292
+ create_sample_btn = gr.Button(
293
+ t("generation.create_sample_btn"),
294
+ variant="primary",
295
+ size="lg",
296
+ )
297
+ with gr.Column(scale=1, variant="compact"):
298
+ simple_vocal_language = gr.Dropdown(
299
+ choices=VALID_LANGUAGES,
300
+ value="unknown",
301
+ allow_custom_value=True,
302
+ label=t("generation.simple_vocal_language_label"),
303
+ interactive=True,
304
+ )
305
+
306
+ # State to track if sample has been created in Simple mode
307
+ simple_sample_created = gr.State(value=False)
308
 
309
+ # Music Caption - wrapped in accordion that can be collapsed in Simple mode
310
+ with gr.Accordion(t("generation.caption_title"), open=False) as caption_accordion:
311
  with gr.Row(equal_height=True):
312
  captions = gr.Textbox(
313
  label=t("generation.caption_label"),
 
317
  scale=9,
318
  )
319
  sample_btn = gr.Button(
320
+ "🎲",
321
  variant="secondary",
322
  size="sm",
323
  scale=1,
324
  )
325
 
326
+ # Lyrics - wrapped in accordion that can be collapsed in Simple mode
327
+ with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
328
  lyrics = gr.Textbox(
329
  label=t("generation.lyrics_label"),
330
  placeholder=t("generation.lyrics_placeholder"),
 
338
  )
339
 
340
  # Optional Parameters
341
+ with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
342
  with gr.Row():
343
  vocal_language = gr.Dropdown(
344
  choices=VALID_LANGUAGES,
 
642
  "repainting_start": repainting_start,
643
  "repainting_end": repainting_end,
644
  "audio_cover_strength": audio_cover_strength,
645
+ # Simple/Custom Mode Components
646
+ "generation_mode": generation_mode,
647
+ "simple_mode_group": simple_mode_group,
648
+ "simple_query_input": simple_query_input,
649
+ "random_desc_btn": random_desc_btn,
650
+ "simple_instrumental_checkbox": simple_instrumental_checkbox,
651
+ "simple_vocal_language": simple_vocal_language,
652
+ "create_sample_btn": create_sample_btn,
653
+ "simple_sample_created": simple_sample_created,
654
+ "caption_accordion": caption_accordion,
655
+ "lyrics_accordion": lyrics_accordion,
656
+ "optional_params_accordion": optional_params_accordion,
657
+ # Existing components
658
  "captions": captions,
659
  "sample_btn": sample_btn,
660
  "load_file": load_file,
acestep/inference.py CHANGED
@@ -797,3 +797,188 @@ def understand_music(
797
  success=False,
798
  error=str(e),
799
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
797
  success=False,
798
  error=str(e),
799
  )
800
+
801
+
802
@dataclass
class CreateSampleResult:
    """Outcome of turning a natural language query into a music sample.

    Returned by the "Simple Mode" / "Inspiration Mode" flow, in which the user
    supplies a free-form description and the LLM produces a caption, lyrics
    and accompanying metadata.

    Attributes:
        caption: Generated detailed music description/caption
        lyrics: Generated lyrics (or "[Instrumental]" for instrumental music)
        bpm: Beats per minute (None if not generated)
        duration: Duration in seconds (None if not generated)
        keyscale: Musical key (e.g., "C Major")
        language: Vocal language code (e.g., "en", "zh")
        timesignature: Time signature (e.g., "4")
        instrumental: Whether this is an instrumental piece
        status_message: Status message from sample creation
        success: Whether sample creation completed successfully
        error: Error message if sample creation failed
    """

    # --- generated sample content and metadata ---
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""
    instrumental: bool = False

    # --- outcome reporting ---
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict suitable for JSON serialization."""
        return asdict(self)
844
+
845
+
846
def _blank_placeholder(value: Any) -> Any:
    """Return '' for ``None`` or the literal 'N/A' placeholder, else ``value`` unchanged."""
    if value is None or value == 'N/A':
        return ''
    return value


def _coerce_bpm(value: Any) -> Optional[int]:
    """Best-effort conversion of a metadata BPM value to ``int``; ``None`` if unusable."""
    # Try a direct int() first so genuine integers / integer strings are kept
    # exactly; fall back to int(float()) because int("120.0") raises ValueError.
    for convert in (int, lambda raw: int(float(raw))):
        try:
            return convert(value)
        except (ValueError, TypeError, OverflowError):
            # ValueError: '', 'N/A', 'nan'; TypeError: None; OverflowError: inf.
            continue
    return None


def _coerce_duration(value: Any) -> Optional[float]:
    """Best-effort conversion of a metadata duration value to ``float``; ``None`` if unusable."""
    try:
        return float(value)
    except (ValueError, TypeError, OverflowError):
        # Covers None, '', 'N/A', other non-numeric text, and ints too large for a float.
        return None


def create_sample(
    llm_handler,
    query: str,
    instrumental: bool = False,
    vocal_language: Optional[List[str]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> CreateSampleResult:
    """Create a music sample from a natural language query using the 5Hz Language Model.

    This is the "Simple Mode" / "Inspiration Mode" entry point: it forwards the
    query to ``llm_handler.create_sample_from_query`` and normalizes the returned
    metadata dictionary into a ``CreateSampleResult``.

    Note: cfg_scale and negative_prompt are not supported in create_sample mode.

    Args:
        llm_handler: Handler exposing ``llm_initialized`` and
            ``create_sample_from_query``.
        query: User's natural language music description
            (e.g., "a soft Bengali love song").
        instrumental: Whether to generate instrumental music (no vocals).
        vocal_language: Vocal language codes (e.g., ["en", "zh"]); passed through
            to the handler unchanged.
        temperature: Sampling temperature; passed through to the handler.
        top_k: Top-K sampling value; passed through to the handler.
        top_p: Top-P (nucleus) sampling value; passed through to the handler.
        repetition_penalty: Repetition penalty; passed through to the handler.
        use_constrained_decoding: Whether to use constrained decoding.
        constrained_decoding_debug: Whether to enable debug logging.

    Returns:
        CreateSampleResult. On success ``success`` is True and the metadata
        fields are populated; ``bpm`` / ``duration`` are ``None`` when the
        handler's value is missing or cannot be parsed as a number, and
        'N/A' placeholders for keyscale / language / timesignature become ''.
        On failure ``success`` is False and ``error`` / ``status_message``
        describe the problem. This function does not raise: any exception from
        the handler is logged and converted into a failed result.

    Example:
        >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language=["bn"])
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
        ...     print(f"Lyrics: {result.lyrics}")
        ...     print(f"BPM: {result.bpm}")
    """
    # Check if LLM is initialized
    if not llm_handler.llm_initialized:
        return CreateSampleResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    # Validate query
    if not query or not query.strip():
        return CreateSampleResult(
            status_message="No query provided. Please enter a music description.",
            success=False,
            error="Empty query",
        )

    try:
        # Call LLM to create sample
        metadata, status = llm_handler.create_sample_from_query(
            query=query,
            instrumental=instrumental,
            vocal_language=vocal_language,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # An empty metadata dict is the handler's way of signalling an error;
        # in that case ``status`` carries the explanation.
        if not metadata:
            return CreateSampleResult(
                status_message=status or "Failed to create sample",
                success=False,
                error=status or "Empty metadata returned",
            )

        # ``dict.get(key, default)`` still returns None when the key exists with
        # a None value, so normalize explicitly to keep the str-typed fields str.
        language = metadata.get('language', metadata.get('vocal_language', ''))

        return CreateSampleResult(
            caption=metadata.get('caption') or '',
            lyrics=metadata.get('lyrics') or '',
            bpm=_coerce_bpm(metadata.get('bpm')),
            duration=_coerce_duration(metadata.get('duration')),
            keyscale=_blank_placeholder(metadata.get('keyscale', '')),
            language=_blank_placeholder(language),
            timesignature=_blank_placeholder(metadata.get('timesignature', '')),
            instrumental=metadata.get('instrumental', instrumental),
            status_message=status,
            success=True,
            error=None,
        )

    except Exception as e:
        # Top-level boundary for the UI/API: log the traceback and report the
        # failure through the result object instead of propagating.
        logger.exception("Sample creation failed")
        return CreateSampleResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )
acestep/llm_inference.py CHANGED
@@ -19,7 +19,7 @@ from transformers.generation.logits_process import (
19
  RepetitionPenaltyLogitsProcessor,
20
  )
21
  from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
22
- from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION
23
 
24
 
25
  class LLMHandler:
@@ -308,7 +308,7 @@ class LLMHandler:
308
  if not os.path.exists(full_lm_model_path):
309
  return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
310
 
311
- logger.info("loading 5Hz LM tokenizer...")
312
  start_time = time.time()
313
  # TODO: load tokenizer too slow, not found solution yet
314
  llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
@@ -1433,6 +1433,185 @@ class LLMHandler:
1433
 
1434
  return after_think.strip()
1435
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1436
  def generate_from_formatted_prompt(
1437
  self,
1438
  formatted_prompt: str,
 
19
  RepetitionPenaltyLogitsProcessor,
20
  )
21
  from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
22
+ from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION
23
 
24
 
25
  class LLMHandler:
 
308
  if not os.path.exists(full_lm_model_path):
309
  return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
310
 
311
+ logger.info("loading 5Hz LM tokenizer... it may take 80~90s")
312
  start_time = time.time()
313
  # TODO: load tokenizer too slow, not found solution yet
314
  llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
 
1433
 
1434
  return after_think.strip()
1435
 
1436
def build_formatted_prompt_for_inspiration(
    self,
    query: str,
    instrumental: bool = False,
    is_negative_prompt: bool = False,
    negative_prompt: str = "NO USER INPUT"
) -> str:
    """
    Render the chat-template prompt used by inspiration/simple mode.

    The system turn carries ``DEFAULT_LM_INSPIRED_INSTRUCTION``; the user turn
    carries either the query plus an ``instrumental: true|false`` line, or the
    negative prompt.

    Args:
        query: User's natural language music description
        instrumental: Flag written into the user turn as "true" / "false"
        is_negative_prompt: If True, the user turn is ``negative_prompt``
            (or "" when it is empty/whitespace) and ``query`` is not used
        negative_prompt: Text used as the user turn when is_negative_prompt=True

    Returns:
        The string produced by the tokenizer's ``apply_chat_template`` with
        ``tokenize=False`` and ``add_generation_prompt=True``

    Raises:
        ValueError: If ``self.llm_tokenizer`` is None

    Example:
        query = "a soft Bengali love song for a quiet evening"
        prompt = handler.build_formatted_prompt_for_inspiration(query, instrumental=False)
    """
    tokenizer = self.llm_tokenizer
    if tokenizer is None:
        raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")

    if not is_negative_prompt:
        # Regular prompt: the query followed by the instrumental flag line
        flag_text = "true" if instrumental else "false"
        user_content = f"{query}\n\ninstrumental: {flag_text}"
    elif negative_prompt and negative_prompt.strip():
        # Negative-prompt variant with a usable negative prompt
        user_content = negative_prompt
    else:
        # Negative-prompt variant with nothing to say: empty user turn
        user_content = ""

    system_turn = {
        "role": "system",
        "content": f"# Instruction\n{DEFAULT_LM_INSPIRED_INSTRUCTION}\n\n",
    }
    user_turn = {"role": "user", "content": user_content}

    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
1489
+
1490
def create_sample_from_query(
    self,
    query: str,
    instrumental: bool = False,
    vocal_language: Optional[List[str]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> Tuple[Dict[str, Any], str]:
    """
    Generate a full music sample description from a free-text query.

    This backs the "Simple Mode" / "Inspiration Mode" feature: the query is
    turned into an inspiration prompt, the LM output is parsed into a metadata
    dictionary, and the lyrics section of the output is attached to it.

    Args:
        query: User's natural language music description
        instrumental: Whether instrumental music (no vocals) is requested
        vocal_language: Candidate vocal language codes (e.g., ["en", "zh"]).
            Empty entries and "unknown" (any case) are ignored; only the first
            remaining entry is used, and it is injected as user metadata
            instead of being generated.
        temperature: Sampling temperature forwarded to generation
        top_k: Top-K sampling value forwarded to generation
        top_p: Top-P (nucleus) sampling value forwarded to generation
        repetition_penalty: Repetition penalty forwarded to generation
        use_constrained_decoding: Whether to use constrained decoding
        constrained_decoding_debug: Whether to enable extra debug logging

    Returns:
        Tuple of (metadata_dict, status_message).
        On failure metadata_dict is empty and status_message explains why.
        On success metadata_dict holds whatever ``parse_lm_output`` extracted,
        plus:
            - lyrics: the extracted lyrics, or "[Instrumental]" when none were
              extracted and ``instrumental`` is True (key absent otherwise)
            - instrumental: the ``instrumental`` argument echoed back

    Example:
        query = "a soft Bengali love song for a quiet evening"
        metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language=["bn"])
        print(metadata['caption'])
        print(metadata['lyrics'])
    """
    if not getattr(self, "llm_initialized", False):
        return {}, "❌ 5Hz LM not initialized. Please initialize it first."

    if not (query and query.strip()):
        return {}, "❌ No query provided. Please enter a music description."

    logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")

    # Render the inspiration prompt
    prompt_text = self.build_formatted_prompt_for_inspiration(
        query=query,
        instrumental=instrumental,
    )
    logger.debug(f"Formatted prompt for inspiration: {prompt_text}")

    # Keep only usable language codes (drop empty entries and "unknown")
    if vocal_language and len(vocal_language) > 0:
        usable_languages = [
            code for code in vocal_language
            if code and code.lower() != "unknown"
        ]
    else:
        usable_languages = []

    # When a language is supplied it is injected, so its generation is skipped
    skip_language = bool(usable_languages)
    user_metadata = None
    if skip_language:
        chosen_language = usable_languages[0]
        user_metadata = {"language": chosen_language}
        logger.info(f"Using user-specified language: {chosen_language}")

    # Generation settings; cfg_scale and negative_prompt are not used here
    generation_cfg = {
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "target_duration": None,  # No duration constraint
        "user_metadata": user_metadata,  # Injected language, if any
        "skip_caption": False,
        "skip_language": skip_language,
        "skip_genres": False,
        "generation_phase": "understand",
        "caption": "",
        "lyrics": "",
    }

    output_text, status = self.generate_from_formatted_prompt(
        formatted_prompt=prompt_text,
        cfg=generation_cfg,
        use_constrained_decoding=use_constrained_decoding,
        constrained_decoding_debug=constrained_decoding_debug,
        stop_at_reasoning=False,  # Keep generating past </think> so lyrics are produced
    )

    if not output_text:
        return {}, status

    # Structured metadata first, then the lyrics section of the same output
    metadata, _ = self.parse_lm_output(output_text)

    lyrics_text = self._extract_lyrics_from_output(output_text)
    if not lyrics_text and instrumental:
        # No lyrics came back for an instrumental request: use the placeholder
        lyrics_text = "[Instrumental]"
    if lyrics_text:
        metadata['lyrics'] = lyrics_text

    # Echo back the instrumental flag
    metadata['instrumental'] = instrumental

    logger.info(f"Sample created successfully. Generated {len(metadata)} fields")
    if constrained_decoding_debug:
        logger.debug(f"Generated metadata: {list(metadata.keys())}")
        logger.debug(f"Output text preview: {output_text[:300]}...")

    field_names = ', '.join(metadata.keys())
    return metadata, f"✅ Sample created successfully\nGenerated fields: {field_names}"
1614
+
1615
  def generate_from_formatted_prompt(
1616
  self,
1617
  formatted_prompt: str,
examples/simple_mode/example_01.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "a soft Bengali love song for a quiet evening",
3
+ "instrumental": false,
4
+ "vocal_language": ["bn"]
5
+ }
examples/simple_mode/example_02.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "an upbeat summer pop song with catchy hooks",
3
+ "instrumental": false,
4
+ "vocal_language": ["en"]
5
+ }
examples/simple_mode/example_03.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "epic orchestral cinematic music for a movie trailer",
3
+ "instrumental": true,
4
+ "vocal_language": ["unknown"]
5
+ }
examples/simple_mode/example_04.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "一首深情的中文抒情歌曲,适合夜晚独自聆听",
3
+ "instrumental": false,
4
+ "vocal_language": ["zh"]
5
+ }
examples/simple_mode/example_05.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "Japanese city pop with nostalgic 80s vibes",
3
+ "instrumental": false,
4
+ "vocal_language": ["ja"]
5
+ }
examples/simple_mode/example_06.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "lo-fi hip hop beats for studying and relaxing",
3
+ "instrumental": true,
4
+ "vocal_language": ["unknown"]
5
+ }
examples/simple_mode/example_07.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "energetic K-pop dance track with powerful vocals",
3
+ "instrumental": false,
4
+ "vocal_language": ["ko"]
5
+ }
examples/simple_mode/example_08.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "romantic Spanish guitar ballad with heartfelt lyrics",
3
+ "instrumental": false,
4
+ "vocal_language": ["es"]
5
+ }
examples/simple_mode/example_09.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "中国风电子舞曲,融合古典乐器与现代节拍",
3
+ "instrumental": false,
4
+ "vocal_language": ["zh"]
5
+ }
examples/simple_mode/example_10.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "description": "peaceful piano melody for meditation and relaxation",
3
+ "instrumental": true,
4
+ "vocal_language": ["unknown"]
5
+ }