ChuxiJ committed on
Commit
2b1ad1c
·
1 Parent(s): 2d3816e

test rewrite format

Browse files
.gitignore CHANGED
@@ -220,4 +220,5 @@ discord_bot/
220
  feishu_bot/
221
  tmp*
222
  torchinductor_root/
223
- scripts/
 
 
220
  feishu_bot/
221
  tmp*
222
  torchinductor_root/
223
+ scripts/
224
+ checkpoints_legacy/
acestep/gradio_ui/events/__init__.py CHANGED
@@ -190,6 +190,37 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
190
  outputs=[generation_section["lyrics"]]
191
  )
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  # ========== Simple/Custom Mode Toggle ==========
194
  generation_section["generation_mode"].change(
195
  fn=gen_h.handle_generation_mode_change,
@@ -245,6 +276,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
245
  generation_section["audio_duration"],
246
  generation_section["key_scale"],
247
  generation_section["vocal_language"],
 
248
  generation_section["time_signature"],
249
  generation_section["instrumental_checkbox"],
250
  generation_section["caption_accordion"],
 
190
  outputs=[generation_section["lyrics"]]
191
  )
192
 
193
+ # ========== Format Button ==========
194
+ # Note: cfg_scale and negative_prompt are not supported in format mode
195
+ generation_section["format_btn"].click(
196
+ fn=lambda caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug: gen_h.handle_format_sample(
197
+ llm_handler, caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug
198
+ ),
199
+ inputs=[
200
+ generation_section["captions"],
201
+ generation_section["lyrics"],
202
+ generation_section["bpm"],
203
+ generation_section["audio_duration"],
204
+ generation_section["key_scale"],
205
+ generation_section["time_signature"],
206
+ generation_section["lm_temperature"],
207
+ generation_section["lm_top_k"],
208
+ generation_section["lm_top_p"],
209
+ generation_section["constrained_decoding_debug"],
210
+ ],
211
+ outputs=[
212
+ generation_section["captions"],
213
+ generation_section["lyrics"],
214
+ generation_section["bpm"],
215
+ generation_section["audio_duration"],
216
+ generation_section["key_scale"],
217
+ generation_section["vocal_language"],
218
+ generation_section["time_signature"],
219
+ results_section["is_format_caption_state"],
220
+ results_section["status_output"],
221
+ ]
222
+ )
223
+
224
  # ========== Simple/Custom Mode Toggle ==========
225
  generation_section["generation_mode"].change(
226
  fn=gen_h.handle_generation_mode_change,
 
276
  generation_section["audio_duration"],
277
  generation_section["key_scale"],
278
  generation_section["vocal_language"],
279
+ generation_section["simple_vocal_language"],
280
  generation_section["time_signature"],
281
  generation_section["instrumental_checkbox"],
282
  generation_section["caption_accordion"],
acestep/gradio_ui/events/generation_handlers.py CHANGED
@@ -13,7 +13,7 @@ from acestep.constants import (
13
  TASK_TYPES_BASE,
14
  )
15
  from acestep.gradio_ui.i18n import t
16
- from acestep.inference import understand_music, create_sample
17
 
18
 
19
  def load_metadata(file_obj):
@@ -256,7 +256,7 @@ def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug
256
 
257
  def load_random_simple_description():
258
  """Load a random description from the simple_mode examples directory.
259
-
260
  Returns:
261
  Tuple of (description, instrumental, vocal_language) for updating UI components
262
  """
@@ -265,39 +265,39 @@ def load_random_simple_description():
265
  current_file = os.path.abspath(__file__)
266
  # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
267
  project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
268
-
269
  # Construct the examples directory path
270
  examples_dir = os.path.join(project_root, "examples", "simple_mode")
271
-
272
  # Check if directory exists
273
  if not os.path.exists(examples_dir):
274
  gr.Warning(t("messages.simple_examples_not_found"))
275
  return gr.update(), gr.update(), gr.update()
276
-
277
  # Find all JSON files in the directory
278
  json_files = glob.glob(os.path.join(examples_dir, "*.json"))
279
-
280
  if not json_files:
281
  gr.Warning(t("messages.simple_examples_empty"))
282
  return gr.update(), gr.update(), gr.update()
283
-
284
  # Randomly select one file
285
  selected_file = random.choice(json_files)
286
-
287
  # Read and parse JSON
288
  try:
289
  with open(selected_file, 'r', encoding='utf-8') as f:
290
  data = json.load(f)
291
-
292
  # Extract fields
293
  description = data.get('description', '')
294
  instrumental = data.get('instrumental', False)
295
- vocal_language = data.get('vocal_language', ['unknown'])
296
-
297
- # Ensure vocal_language is a list
298
- if isinstance(vocal_language, str):
299
- vocal_language = [vocal_language]
300
-
301
  gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
302
  return description, instrumental, vocal_language
303
 
@@ -564,7 +564,7 @@ def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
564
  def handle_simple_instrumental_change(is_instrumental: bool):
565
  """
566
  Handle simple mode instrumental checkbox changes.
567
- When checked: set vocal_language to ["unknown"] and disable editing.
568
  When unchecked: enable vocal_language editing.
569
 
570
  Args:
@@ -574,7 +574,7 @@ def handle_simple_instrumental_change(is_instrumental: bool):
574
  gr.update for simple_vocal_language dropdown
575
  """
576
  if is_instrumental:
577
- return gr.update(value=["unknown"], interactive=False)
578
  else:
579
  return gr.update(interactive=True)
580
 
@@ -653,7 +653,7 @@ def handle_create_sample(
653
  llm_handler,
654
  query: str,
655
  instrumental: bool,
656
- vocal_language: list,
657
  lm_temperature: float,
658
  lm_top_k: int,
659
  lm_top_p: float,
@@ -671,7 +671,7 @@ def handle_create_sample(
671
  llm_handler: LLM handler instance
672
  query: User's natural language music description
673
  instrumental: Whether to generate instrumental music
674
- vocal_language: List of preferred vocal languages for constrained decoding
675
  lm_temperature: LLM temperature for generation
676
  lm_top_k: LLM top-k sampling
677
  lm_top_p: LLM top-p sampling
@@ -695,27 +695,6 @@ def handle_create_sample(
695
  - is_format_caption_state (True)
696
  - status_output
697
  """
698
- # Validate query
699
- if not query or not query.strip():
700
- gr.Warning(t("messages.empty_query"))
701
- return (
702
- gr.update(), # captions - no change
703
- gr.update(), # lyrics - no change
704
- gr.update(), # bpm - no change
705
- gr.update(), # audio_duration - no change
706
- gr.update(), # key_scale - no change
707
- gr.update(), # vocal_language - no change
708
- gr.update(), # time_signature - no change
709
- gr.update(), # instrumental_checkbox - no change
710
- gr.update(), # caption_accordion - no change
711
- gr.update(), # lyrics_accordion - no change
712
- gr.update(interactive=False), # generate_btn - keep disabled
713
- False, # simple_sample_created - still False
714
- gr.update(), # think_checkbox - no change
715
- gr.update(), # is_format_caption_state - no change
716
- t("messages.empty_query"), # status_output
717
- )
718
-
719
  # Check if LLM is initialized
720
  if not llm_handler.llm_initialized:
721
  gr.Warning(t("messages.lm_not_initialized"))
@@ -765,6 +744,7 @@ def handle_create_sample(
765
  gr.update(), # audio_duration - no change
766
  gr.update(), # key_scale - no change
767
  gr.update(), # vocal_language - no change
 
768
  gr.update(), # time_signature - no change
769
  gr.update(), # instrumental_checkbox - no change
770
  gr.update(), # caption_accordion - no change
@@ -786,6 +766,7 @@ def handle_create_sample(
786
  result.duration if result.duration and result.duration > 0 else -1, # audio_duration
787
  result.keyscale, # key_scale
788
  result.language, # vocal_language
 
789
  result.timesignature, # time_signature
790
  result.instrumental, # instrumental_checkbox
791
  gr.update(open=True), # caption_accordion - expand
@@ -798,3 +779,125 @@ def handle_create_sample(
798
  )
799
 
800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  TASK_TYPES_BASE,
14
  )
15
  from acestep.gradio_ui.i18n import t
16
+ from acestep.inference import understand_music, create_sample, format_sample
17
 
18
 
19
  def load_metadata(file_obj):
 
256
 
257
  def load_random_simple_description():
258
  """Load a random description from the simple_mode examples directory.
259
+
260
  Returns:
261
  Tuple of (description, instrumental, vocal_language) for updating UI components
262
  """
 
265
  current_file = os.path.abspath(__file__)
266
  # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
267
  project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
268
+
269
  # Construct the examples directory path
270
  examples_dir = os.path.join(project_root, "examples", "simple_mode")
271
+
272
  # Check if directory exists
273
  if not os.path.exists(examples_dir):
274
  gr.Warning(t("messages.simple_examples_not_found"))
275
  return gr.update(), gr.update(), gr.update()
276
+
277
  # Find all JSON files in the directory
278
  json_files = glob.glob(os.path.join(examples_dir, "*.json"))
279
+
280
  if not json_files:
281
  gr.Warning(t("messages.simple_examples_empty"))
282
  return gr.update(), gr.update(), gr.update()
283
+
284
  # Randomly select one file
285
  selected_file = random.choice(json_files)
286
+
287
  # Read and parse JSON
288
  try:
289
  with open(selected_file, 'r', encoding='utf-8') as f:
290
  data = json.load(f)
291
+
292
  # Extract fields
293
  description = data.get('description', '')
294
  instrumental = data.get('instrumental', False)
295
+ vocal_language = data.get('vocal_language', 'unknown')
296
+
297
+ # Ensure vocal_language is a string
298
+ if isinstance(vocal_language, list):
299
+ vocal_language = vocal_language[0] if vocal_language else 'unknown'
300
+
301
  gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
302
  return description, instrumental, vocal_language
303
 
 
564
  def handle_simple_instrumental_change(is_instrumental: bool):
565
  """
566
  Handle simple mode instrumental checkbox changes.
567
+ When checked: set vocal_language to "unknown" and disable editing.
568
  When unchecked: enable vocal_language editing.
569
 
570
  Args:
 
574
  gr.update for simple_vocal_language dropdown
575
  """
576
  if is_instrumental:
577
+ return gr.update(value="unknown", interactive=False)
578
  else:
579
  return gr.update(interactive=True)
580
 
 
653
  llm_handler,
654
  query: str,
655
  instrumental: bool,
656
+ vocal_language: str,
657
  lm_temperature: float,
658
  lm_top_k: int,
659
  lm_top_p: float,
 
671
  llm_handler: LLM handler instance
672
  query: User's natural language music description
673
  instrumental: Whether to generate instrumental music
674
+ vocal_language: Preferred vocal language for constrained decoding
675
  lm_temperature: LLM temperature for generation
676
  lm_top_k: LLM top-k sampling
677
  lm_top_p: LLM top-p sampling
 
695
  - is_format_caption_state (True)
696
  - status_output
697
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
  # Check if LLM is initialized
699
  if not llm_handler.llm_initialized:
700
  gr.Warning(t("messages.lm_not_initialized"))
 
744
  gr.update(), # audio_duration - no change
745
  gr.update(), # key_scale - no change
746
  gr.update(), # vocal_language - no change
747
+ gr.update(), # simple vocal_language - no change
748
  gr.update(), # time_signature - no change
749
  gr.update(), # instrumental_checkbox - no change
750
  gr.update(), # caption_accordion - no change
 
766
  result.duration if result.duration and result.duration > 0 else -1, # audio_duration
767
  result.keyscale, # key_scale
768
  result.language, # vocal_language
769
+ result.language, # simple vocal_language
770
  result.timesignature, # time_signature
771
  result.instrumental, # instrumental_checkbox
772
  gr.update(open=True), # caption_accordion - expand
 
779
  )
780
 
781
 
782
def _format_sample_unchanged(status):
    """Return the 9-slot Format output tuple that leaves every field as-is.

    Only the final slot (status_output) carries a value; every other slot is an
    empty ``gr.update()`` so the corresponding UI component keeps its content.

    Args:
        status: Text to show in the status output component.
    """
    return (
        gr.update(),  # captions - no change
        gr.update(),  # lyrics - no change
        gr.update(),  # bpm - no change
        gr.update(),  # audio_duration - no change
        gr.update(),  # key_scale - no change
        gr.update(),  # vocal_language - no change
        gr.update(),  # time_signature - no change
        gr.update(),  # is_format_caption_state - no change
        status,  # status_output
    )


def handle_format_sample(
    llm_handler,
    caption: str,
    lyrics: str,
    bpm,
    audio_duration,
    key_scale: str,
    time_signature: str,
    lm_temperature: float,
    lm_top_k: int,
    lm_top_p: float,
    constrained_decoding_debug: bool = False,
):
    """
    Handle the Format button click to format caption and lyrics.

    Takes user-provided caption and lyrics, and uses the LLM to generate
    structured music metadata and an enhanced description.

    Note: cfg_scale and negative_prompt are not supported in format mode.

    Args:
        llm_handler: LLM handler instance
        caption: User's caption/description
        lyrics: User's lyrics
        bpm: User-provided BPM (optional, for constrained decoding)
        audio_duration: User-provided duration (optional, for constrained decoding)
        key_scale: User-provided key scale (optional, for constrained decoding)
        time_signature: User-provided time signature (optional, for constrained decoding)
        lm_temperature: LLM temperature for generation
        lm_top_k: LLM top-k sampling (falsy / 0 disables it)
        lm_top_p: LLM top-p sampling (falsy or >= 1.0 disables it)
        constrained_decoding_debug: Whether to enable debug logging

    Returns:
        Tuple of updates for:
        - captions
        - lyrics
        - bpm
        - audio_duration
        - key_scale
        - vocal_language
        - time_signature
        - is_format_caption_state
        - status_output
    """
    # Check if LLM is initialized
    if not llm_handler.llm_initialized:
        not_ready_message = t("messages.lm_not_initialized")
        gr.Warning(not_ready_message)
        return _format_sample_unchanged(not_ready_message)

    # Build user_metadata from provided values for constrained decoding.
    # Only positive numbers and non-blank strings are forwarded.
    user_metadata = {}
    if bpm is not None and bpm > 0:
        user_metadata['bpm'] = int(bpm)
    if audio_duration is not None and audio_duration > 0:
        user_metadata['duration'] = int(audio_duration)
    if key_scale and key_scale.strip():
        user_metadata['keyscale'] = key_scale.strip()
    if time_signature and time_signature.strip():
        user_metadata['timesignature'] = time_signature.strip()

    # Only pass user_metadata if we have at least one field
    user_metadata_to_pass = user_metadata if user_metadata else None

    # Convert LM parameters: a disabled sampler is passed as None
    top_k_value = int(lm_top_k) if lm_top_k else None
    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p

    # Call format_sample API
    result = format_sample(
        llm_handler=llm_handler,
        caption=caption,
        lyrics=lyrics,
        user_metadata=user_metadata_to_pass,
        temperature=lm_temperature,
        top_k=top_k_value,
        top_p=top_p_value,
        use_constrained_decoding=True,
        constrained_decoding_debug=constrained_decoding_debug,
    )

    # Handle error
    if not result.success:
        failure_message = result.status_message
        if not failure_message:
            # "messages.format_failed" is defined with an {error} placeholder,
            # so the detail must be supplied as a keyword argument.
            # NOTE(review): assumes the result object exposes an `error`
            # attribute like CreateSampleResult does - confirm.
            error_detail = getattr(result, "error", None) or "unknown error"
            failure_message = t("messages.format_failed", error=error_detail)
        gr.Warning(failure_message)
        return _format_sample_unchanged(failure_message)

    # Success - populate fields
    gr.Info(t("messages.format_success"))

    return (
        result.caption,  # captions
        result.lyrics,  # lyrics
        result.bpm,  # bpm
        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
        result.keyscale,  # key_scale
        result.language,  # vocal_language
        result.timesignature,  # time_signature
        True,  # is_format_caption_state - True (LM-formatted)
        result.status_message,  # status_output
    )
903
+
acestep/gradio_ui/events/results_handlers.py CHANGED
@@ -465,6 +465,14 @@ def generate_with_progress(
465
  ):
466
  """Generate audio with progress tracking"""
467
 
 
 
 
 
 
 
 
 
468
  # step 1: prepare inputs
469
  # generate_music, GenerationParams, GenerationConfig
470
  gen_params = GenerationParams(
@@ -496,7 +504,7 @@ def generate_with_progress(
496
  lm_top_k=lm_top_k,
497
  lm_top_p=lm_top_p,
498
  lm_negative_prompt=lm_negative_prompt,
499
- use_cot_metas=use_cot_metas,
500
  use_cot_caption=use_cot_caption,
501
  use_cot_language=use_cot_language,
502
  use_constrained_decoding=True,
@@ -587,7 +595,7 @@ def generate_with_progress(
587
  # Clear lrc_display with empty string - this triggers .change() to clear subtitles
588
  clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
589
  clear_accordions = [gr.skip() for _ in range(8)] # Don't change accordion visibility
590
- dump_audio = [None for _ in range(8)]
591
  yield (
592
  # Audio outputs - just skip, value will be updated in loop
593
  # Subtitles will be cleared via lrc_display.change()
@@ -1682,6 +1690,8 @@ def generate_next_batch_background(
1682
 
1683
  # Call generate_with_progress with the saved parameters
1684
  # Note: generate_with_progress is a generator, need to iterate through it
 
 
1685
  generator = generate_with_progress(
1686
  dit_handler,
1687
  llm_handler,
@@ -1719,7 +1729,7 @@ def generate_next_batch_background(
1719
  use_cot_metas=params.get("use_cot_metas"),
1720
  use_cot_caption=params.get("use_cot_caption"),
1721
  use_cot_language=params.get("use_cot_language"),
1722
- is_format_caption=is_format_caption,
1723
  constrained_decoding_debug=params.get("constrained_decoding_debug"),
1724
  allow_lm_batch=params.get("allow_lm_batch"),
1725
  auto_score=params.get("auto_score"),
 
465
  ):
466
  """Generate audio with progress tracking"""
467
 
468
+ # Skip Phase 1 metas COT if sample is already formatted (from LLM/file/random)
469
+ # This avoids redundant LLM calls since metas (bpm, keyscale, etc.) are already generated
470
+ actual_use_cot_metas = use_cot_metas
471
+ if is_format_caption and use_cot_metas:
472
+ actual_use_cot_metas = False
473
+ logger.info("[generate_with_progress] Skipping Phase 1 metas COT: sample is already formatted (is_format_caption=True)")
474
+ gr.Info(t("messages.skipping_metas_cot"))
475
+
476
  # step 1: prepare inputs
477
  # generate_music, GenerationParams, GenerationConfig
478
  gen_params = GenerationParams(
 
504
  lm_top_k=lm_top_k,
505
  lm_top_p=lm_top_p,
506
  lm_negative_prompt=lm_negative_prompt,
507
+ use_cot_metas=actual_use_cot_metas,
508
  use_cot_caption=use_cot_caption,
509
  use_cot_language=use_cot_language,
510
  use_constrained_decoding=True,
 
595
  # Clear lrc_display with empty string - this triggers .change() to clear subtitles
596
  clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
597
  clear_accordions = [gr.skip() for _ in range(8)] # Don't change accordion visibility
598
+ dump_audio = [gr.update(value="", subtitles="") for _ in range(8)]
599
  yield (
600
  # Audio outputs - just skip, value will be updated in loop
601
  # Subtitles will be cleared via lrc_display.change()
 
1690
 
1691
  # Call generate_with_progress with the saved parameters
1692
  # Note: generate_with_progress is a generator, need to iterate through it
1693
+ # For AutoGen background batches, always skip metas COT since we want to
1694
+ # generate NEW audio codes with new seeds, not regenerate the same metas
1695
  generator = generate_with_progress(
1696
  dit_handler,
1697
  llm_handler,
 
1729
  use_cot_metas=params.get("use_cot_metas"),
1730
  use_cot_caption=params.get("use_cot_caption"),
1731
  use_cot_language=params.get("use_cot_language"),
1732
+ is_format_caption=is_format_caption, # Pass through - will skip metas COT if True
1733
  constrained_decoding_debug=params.get("constrained_decoding_debug"),
1734
  allow_lm_batch=params.get("allow_lm_batch"),
1735
  auto_score=params.get("auto_score"),
acestep/gradio_ui/i18n/en.json CHANGED
@@ -84,7 +84,7 @@
84
  "mode_simple": "Simple",
85
  "mode_custom": "Custom",
86
  "simple_query_label": "Song Description",
87
- "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'",
88
  "simple_query_info": "Enter a natural language description of the music you want to generate",
89
  "simple_vocal_language_label": "Vocal Language (optional)",
90
  "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
@@ -98,6 +98,7 @@
98
  "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
99
  "lyrics_info": "Song lyrics with structure",
100
  "instrumental_label": "Instrumental",
 
101
  "optional_params": "⚙️ Optional Parameters",
102
  "vocal_language_label": "Vocal Language (optional)",
103
  "vocal_language_info": "use `unknown` for inst",
@@ -227,6 +228,9 @@
227
  "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
228
  "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
229
  "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
230
- "simple_example_loaded": "🎲 Loaded random example from {filename}"
 
 
 
231
  }
232
  }
 
84
  "mode_simple": "Simple",
85
  "mode_custom": "Custom",
86
  "simple_query_label": "Song Description",
87
+ "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'. Leave empty for a random sample.",
88
  "simple_query_info": "Enter a natural language description of the music you want to generate",
89
  "simple_vocal_language_label": "Vocal Language (optional)",
90
  "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
 
98
  "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
99
  "lyrics_info": "Song lyrics with structure",
100
  "instrumental_label": "Instrumental",
101
+ "format_btn": "Format",
102
  "optional_params": "⚙️ Optional Parameters",
103
  "vocal_language_label": "Vocal Language (optional)",
104
  "vocal_language_info": "use `unknown` for inst",
 
228
  "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
229
  "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
230
  "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
231
+ "simple_example_loaded": "🎲 Loaded random example from {filename}",
232
+ "format_success": "✅ Caption and lyrics formatted successfully",
233
+ "format_failed": "❌ Format failed: {error}",
234
+ "skipping_metas_cot": "⚡ Skipping Phase 1 metas COT (sample already formatted)"
235
  }
236
  }
acestep/gradio_ui/i18n/ja.json CHANGED
@@ -84,7 +84,7 @@
84
  "mode_simple": "シンプル",
85
  "mode_custom": "カスタム",
86
  "simple_query_label": "曲の説明",
87
- "simple_query_placeholder": "作成したい音楽を説明してください。例:'静かな夜のための優しいベンガルのラブソング'",
88
  "simple_query_info": "生成したい音楽の自然言語の説明を入力",
89
  "simple_vocal_language_label": "ボーカル言語(オプション)",
90
  "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
@@ -98,6 +98,7 @@
98
  "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
99
  "lyrics_info": "構造を持つ曲の歌詞",
100
  "instrumental_label": "インストゥルメンタル",
 
101
  "optional_params": "⚙️ オプションパラメータ",
102
  "vocal_language_label": "ボーカル言語(オプション)",
103
  "vocal_language_info": "インストには`unknown`を使用",
@@ -227,6 +228,9 @@
227
  "sample_created": "✅ サンプルが作成されました!キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
228
  "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
229
  "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
230
- "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました"
 
 
 
231
  }
232
  }
 
84
  "mode_simple": "シンプル",
85
  "mode_custom": "カスタム",
86
  "simple_query_label": "曲の説明",
87
+ "simple_query_placeholder": "作成したい音楽を説明してください。例:'静かな夜のための優しいベンガルのラブソング'。空欄の場合はランダムなサンプルが生成されます。",
88
  "simple_query_info": "生成したい音楽の自然言語の説明を入力",
89
  "simple_vocal_language_label": "ボーカル言語(オプション)",
90
  "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
 
98
  "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
99
  "lyrics_info": "構造を持つ曲の歌詞",
100
  "instrumental_label": "インストゥルメンタル",
101
+ "format_btn": "フォーマット",
102
  "optional_params": "⚙️ オプションパラメータ",
103
  "vocal_language_label": "ボーカル言語(オプション)",
104
  "vocal_language_info": "インストには`unknown`を使用",
 
228
  "sample_created": "✅ サンプルが作成されました!キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
229
  "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
230
  "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
231
+ "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました",
232
+ "format_success": "✅ キャプションと歌詞のフォーマットに成功しました",
233
+ "format_failed": "❌ フォーマットに失敗しました: {error}",
234
+ "skipping_metas_cot": "⚡ Phase 1 メタデータ COT をスキップ(サンプルは既にフォーマット済み)"
235
  }
236
  }
acestep/gradio_ui/i18n/zh.json CHANGED
@@ -84,7 +84,7 @@
84
  "mode_simple": "简单",
85
  "mode_custom": "自定义",
86
  "simple_query_label": "歌曲描述",
87
- "simple_query_placeholder": "描述你想创作的音乐,例如:'给我生成一首暗黑的戏剧古风,歌词要华丽'",
88
  "simple_query_info": "输入你想生成的音乐的自然语言描述",
89
  "simple_vocal_language_label": "人声语言(可选)",
90
  "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
@@ -98,6 +98,7 @@
98
  "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
99
  "lyrics_info": "带有结构的歌曲歌词",
100
  "instrumental_label": "纯音乐",
 
101
  "optional_params": "⚙️ 可选参数",
102
  "vocal_language_label": "人声语言(可选)",
103
  "vocal_language_info": "纯音乐使用 `unknown`",
@@ -227,6 +228,9 @@
227
  "sample_created": "✅ 样本已创建!检查描述和歌词,然后点击生成音乐。",
228
  "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
229
  "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
230
- "simple_example_loaded": "🎲 已从 {filename} 加载随机示例"
 
 
 
231
  }
232
  }
 
84
  "mode_simple": "简单",
85
  "mode_custom": "自定义",
86
  "simple_query_label": "歌曲描述",
87
+ "simple_query_placeholder": "描述你想创作的音乐,例如:'给我生成一首暗黑的戏剧古风,歌词要华丽'。留空则随机生成样本。",
88
  "simple_query_info": "输入你想生成的音乐的自然语言描述",
89
  "simple_vocal_language_label": "人声语言(可选)",
90
  "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
 
98
  "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
99
  "lyrics_info": "带有结构的歌曲歌词",
100
  "instrumental_label": "纯音乐",
101
+ "format_btn": "格式化",
102
  "optional_params": "⚙️ 可选参数",
103
  "vocal_language_label": "人声语言(可选)",
104
  "vocal_language_info": "纯音乐使用 `unknown`",
 
228
  "sample_created": "✅ 样本已创建!检查描述和歌词,然后点击生成音乐。",
229
  "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
230
  "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
231
+ "simple_example_loaded": "🎲 已从 {filename} 加载随机示例",
232
+ "format_success": "✅ 描述和歌词格式化成功",
233
+ "format_failed": "❌ 格式化失败: {error}",
234
+ "skipping_metas_cot": "⚡ 跳过 Phase 1 元数据 COT(样本已格式化)"
235
  }
236
  }
acestep/gradio_ui/interfaces/generation.py CHANGED
@@ -314,15 +314,15 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
314
  placeholder=t("generation.caption_placeholder"),
315
  lines=3,
316
  info=t("generation.caption_info"),
317
- scale=9,
318
  )
319
- sample_btn = gr.Button(
320
- "🎲",
321
- variant="secondary",
322
- size="sm",
323
- scale=1,
324
- )
325
-
326
  # Lyrics - wrapped in accordion that can be collapsed in Simple mode
327
  with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
328
  lyrics = gr.Textbox(
@@ -331,22 +331,40 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
331
  lines=8,
332
  info=t("generation.lyrics_info")
333
  )
334
- instrumental_checkbox = gr.Checkbox(
335
- label=t("generation.instrumental_label"),
336
- value=False,
337
- scale=1,
338
- )
339
-
340
- # Optional Parameters
341
- with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
342
- with gr.Row():
 
 
 
343
  vocal_language = gr.Dropdown(
344
  choices=VALID_LANGUAGES,
345
  value="unknown",
346
  label=t("generation.vocal_language_label"),
 
 
347
  allow_custom_value=True,
348
- info=t("generation.vocal_language_info")
 
 
 
 
 
 
 
 
 
349
  )
 
 
 
 
350
  bpm = gr.Number(
351
  label=t("generation.bpm_label"),
352
  value=None,
@@ -679,6 +697,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
679
  "autogen_checkbox": autogen_checkbox,
680
  "generate_btn": generate_btn,
681
  "instrumental_checkbox": instrumental_checkbox,
 
682
  "constrained_decoding_debug": constrained_decoding_debug,
683
  "score_scale": score_scale,
684
  "allow_lm_batch": allow_lm_batch,
 
314
  placeholder=t("generation.caption_placeholder"),
315
  lines=3,
316
  info=t("generation.caption_info"),
317
+ scale=12,
318
  )
319
+ with gr.Column(scale=1, min_width=100):
320
+ sample_btn = gr.Button(
321
+ "🎲",
322
+ variant="secondary",
323
+ size="sm",
324
+ scale=2,
325
+ )
326
  # Lyrics - wrapped in accordion that can be collapsed in Simple mode
327
  with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
328
  lyrics = gr.Textbox(
 
331
  lines=8,
332
  info=t("generation.lyrics_info")
333
  )
334
+
335
+ with gr.Row(variant="compact", equal_height=True):
336
+ instrumental_checkbox = gr.Checkbox(
337
+ label=t("generation.instrumental_label"),
338
+ value=False,
339
+ scale=1,
340
+ min_width=120,
341
+ container=True,
342
+ )
343
+
344
+ # Middle: language selection (Dropdown)
345
+ # Removed the gr.HTML hack; use the label parameter directly and let Gradio handle the alignment
346
  vocal_language = gr.Dropdown(
347
  choices=VALID_LANGUAGES,
348
  value="unknown",
349
  label=t("generation.vocal_language_label"),
350
+ show_label=False,
351
+ container=True,
352
  allow_custom_value=True,
353
+ scale=3,
354
+ )
355
+
356
+ # Right: Format button (Button)
357
+ # Placed at the far right of the same row so it is easier to reach
358
+ format_btn = gr.Button(
359
+ t("generation.format_btn"),
360
+ variant="secondary",
361
+ scale=1,
362
+ min_width=80,
363
  )
364
+
365
+ # Optional Parameters
366
+ with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
367
+ with gr.Row():
368
  bpm = gr.Number(
369
  label=t("generation.bpm_label"),
370
  value=None,
 
697
  "autogen_checkbox": autogen_checkbox,
698
  "generate_btn": generate_btn,
699
  "instrumental_checkbox": instrumental_checkbox,
700
+ "format_btn": format_btn,
701
  "constrained_decoding_debug": constrained_decoding_debug,
702
  "score_scale": score_scale,
703
  "allow_lm_batch": allow_lm_batch,
acestep/inference.py CHANGED
@@ -671,8 +671,6 @@ def understand_music(
671
  llm_handler,
672
  audio_codes: str,
673
  temperature: float = 0.85,
674
- cfg_scale: float = 1.0,
675
- negative_prompt: str = "NO USER INPUT",
676
  top_k: Optional[int] = None,
677
  top_p: Optional[float] = None,
678
  repetition_penalty: float = 1.0,
@@ -687,13 +685,13 @@ def understand_music(
687
  If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
688
  instead of analyzing existing codes.
689
 
 
 
690
  Args:
691
  llm_handler: Initialized LLM handler (LLMHandler instance)
692
  audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
693
  Use empty string or "NO USER INPUT" to generate a sample example.
694
  temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
695
- cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
696
- negative_prompt: Negative prompt for CFG guidance
697
  top_k: Top-K sampling (None or 0 = disabled)
698
  top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
699
  repetition_penalty: Repetition penalty (1.0 = no penalty)
@@ -727,8 +725,6 @@ def understand_music(
727
  metadata, status = llm_handler.understand_audio_from_codes(
728
  audio_codes=audio_codes,
729
  temperature=temperature,
730
- cfg_scale=cfg_scale,
731
- negative_prompt=negative_prompt,
732
  top_k=top_k,
733
  top_p=top_p,
734
  repetition_penalty=repetition_penalty,
@@ -847,7 +843,7 @@ def create_sample(
847
  llm_handler,
848
  query: str,
849
  instrumental: bool = False,
850
- vocal_language: Optional[List[str]] = None,
851
  temperature: float = 0.85,
852
  top_k: Optional[int] = None,
853
  top_p: Optional[float] = None,
@@ -869,9 +865,9 @@ def create_sample(
869
  llm_handler: Initialized LLM handler (LLMHandler instance)
870
  query: User's natural language music description (e.g., "a soft Bengali love song")
871
  instrumental: Whether to generate instrumental music (no vocals)
872
- vocal_language: List of allowed vocal languages for constrained decoding (e.g., ["en", "zh"]).
873
- If provided, the model will be constrained to generate lyrics in these languages.
874
- If None or ["unknown"], no language constraint is applied.
875
  temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
876
  top_k: Top-K sampling (None or 0 = disabled)
877
  top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
@@ -883,7 +879,7 @@ def create_sample(
883
  CreateSampleResult with generated sample fields and status
884
 
885
  Example:
886
- >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language=["bn"])
887
  >>> if result.success:
888
  ... print(f"Caption: {result.caption}")
889
  ... print(f"Lyrics: {result.lyrics}")
@@ -897,14 +893,6 @@ def create_sample(
897
  error="LLM not initialized",
898
  )
899
 
900
- # Validate query
901
- if not query or not query.strip():
902
- return CreateSampleResult(
903
- status_message="No query provided. Please enter a music description.",
904
- success=False,
905
- error="Empty query",
906
- )
907
-
908
  try:
909
  # Call LLM to create sample
910
  metadata, status = llm_handler.create_sample_from_query(
@@ -982,3 +970,175 @@ def create_sample(
982
  success=False,
983
  error=str(e),
984
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
671
  llm_handler,
672
  audio_codes: str,
673
  temperature: float = 0.85,
 
 
674
  top_k: Optional[int] = None,
675
  top_p: Optional[float] = None,
676
  repetition_penalty: float = 1.0,
 
685
  If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
686
  instead of analyzing existing codes.
687
 
688
+ Note: cfg_scale and negative_prompt are not supported in understand mode.
689
+
690
  Args:
691
  llm_handler: Initialized LLM handler (LLMHandler instance)
692
  audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
693
  Use empty string or "NO USER INPUT" to generate a sample example.
694
  temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
 
 
695
  top_k: Top-K sampling (None or 0 = disabled)
696
  top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
697
  repetition_penalty: Repetition penalty (1.0 = no penalty)
 
725
  metadata, status = llm_handler.understand_audio_from_codes(
726
  audio_codes=audio_codes,
727
  temperature=temperature,
 
 
728
  top_k=top_k,
729
  top_p=top_p,
730
  repetition_penalty=repetition_penalty,
 
843
  llm_handler,
844
  query: str,
845
  instrumental: bool = False,
846
+ vocal_language: Optional[str] = None,
847
  temperature: float = 0.85,
848
  top_k: Optional[int] = None,
849
  top_p: Optional[float] = None,
 
865
  llm_handler: Initialized LLM handler (LLMHandler instance)
866
  query: User's natural language music description (e.g., "a soft Bengali love song")
867
  instrumental: Whether to generate instrumental music (no vocals)
868
+ vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
869
+ If provided, the model will be constrained to generate lyrics in this language.
870
+ If None or "unknown", no language constraint is applied.
871
  temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
872
  top_k: Top-K sampling (None or 0 = disabled)
873
  top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
 
879
  CreateSampleResult with generated sample fields and status
880
 
881
  Example:
882
+ >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language="bn")
883
  >>> if result.success:
884
  ... print(f"Caption: {result.caption}")
885
  ... print(f"Lyrics: {result.lyrics}")
 
893
  error="LLM not initialized",
894
  )
895
 
 
 
 
 
 
 
 
 
896
  try:
897
  # Call LLM to create sample
898
  metadata, status = llm_handler.create_sample_from_query(
 
970
  success=False,
971
  error=str(e),
972
  )
973
+
974
+
975
@dataclass
class FormatSampleResult:
    """Result of formatting user-provided caption and lyrics.

    This is used by the "Format" feature where users provide caption and lyrics,
    and the LLM formats them into structured music metadata and an enhanced description.

    Attributes:
        caption: Enhanced/formatted music description/caption
        lyrics: Formatted lyrics (may be same as input or reformatted)
        bpm: Beats per minute (None if not detected)
        duration: Duration in seconds (None if not detected)
        keyscale: Musical key (e.g., "C Major")
        language: Vocal language code (e.g., "en", "zh")
        timesignature: Time signature (e.g., "4")
        status_message: Status message from formatting
        success: Whether formatting completed successfully
        error: Error message if formatting failed
    """
    # Metadata fields
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""

    # Status
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert result to dictionary for JSON serialization."""
        return asdict(self)


# Values that stand for "no value" in the metadata returned by the LM.
_FORMAT_MISSING_VALUES = ("", "N/A")


def _format_text_field(value: Any) -> str:
    """Return *value* as stripped text; None and "N/A"/empty placeholders become ""."""
    if value is None:
        return ""
    text = str(value).strip()
    return "" if text in _FORMAT_MISSING_VALUES else text


def _format_number_field(value: Any) -> Optional[float]:
    """Parse *value* as a finite float; return None when absent or unparseable."""
    import math  # local import: keeps this block self-contained

    if value is None:
        return None
    if isinstance(value, str):
        value = value.strip()
        if value in _FORMAT_MISSING_VALUES:
            return None
    try:
        number = float(value)
    except (ValueError, TypeError):
        return None
    # NaN/inf are not valid JSON numbers and cannot be converted to int.
    return number if math.isfinite(number) else None


def format_sample(
    llm_handler,
    caption: str,
    lyrics: str,
    user_metadata: Optional[Dict[str, Any]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> FormatSampleResult:
    """Format user-provided caption and lyrics using the 5Hz Language Model.

    This function takes user input (caption and lyrics) and generates structured
    music metadata including an enhanced caption, BPM, duration, key, language,
    and time signature.

    If user_metadata is provided, those values will be used to constrain the
    decoding, ensuring the output matches user-specified values.

    Note: cfg_scale and negative_prompt are not supported in format mode.

    Args:
        llm_handler: Initialized LLM handler (LLMHandler instance)
        caption: User's caption/description (e.g., "Latin pop, reggaeton")
        lyrics: User's lyrics with structure tags
        user_metadata: Optional dict with user-provided metadata to constrain decoding.
            Supported keys: bpm, duration, keyscale, timesignature, language
        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
        top_k: Top-K sampling (None or 0 = disabled)
        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
        repetition_penalty: Repetition penalty (1.0 = no penalty)
        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
        constrained_decoding_debug: Whether to enable debug logging for constrained decoding

    Returns:
        FormatSampleResult with formatted metadata fields and status. Failures
        (uninitialized LM, empty metadata, or any exception raised while
        formatting) are reported through ``success=False`` and ``error``
        rather than raised.

    Example:
        >>> result = format_sample(llm_handler, "Latin pop, reggaeton", "[Verse 1]\\nHola mundo...")
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
        ...     print(f"BPM: {result.bpm}")
        ...     print(f"Lyrics: {result.lyrics}")
    """
    # Same defensive check the handler itself uses: a handler that never set
    # the flag counts as uninitialized instead of raising AttributeError.
    if not getattr(llm_handler, "llm_initialized", False):
        return FormatSampleResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    try:
        metadata, status = llm_handler.format_sample_from_input(
            caption=caption,
            lyrics=lyrics,
            user_metadata=user_metadata,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # Empty metadata is how the handler signals an error.
        if not metadata:
            return FormatSampleResult(
                status_message=status or "Failed to format input",
                success=False,
                error=status or "Empty metadata returned",
            )

        # BPM may arrive as an int, a float, or a numeric string such as
        # "120.0"; parse via float first so none of these are dropped.
        bpm_number = _format_number_field(metadata.get("bpm"))

        return FormatSampleResult(
            caption=metadata.get("caption") or "",
            # Fall back to the input lyrics when the LM produced none.
            lyrics=metadata.get("lyrics") or lyrics,
            bpm=None if bpm_number is None else int(bpm_number),
            duration=_format_number_field(metadata.get("duration")),
            keyscale=_format_text_field(metadata.get("keyscale")),
            language=_format_text_field(
                metadata.get("language", metadata.get("vocal_language"))
            ),
            timesignature=_format_text_field(metadata.get("timesignature")),
            status_message=status,
            success=True,
            error=None,
        )

    except Exception as e:
        # Top-level boundary for UI/API callers: log the traceback and report
        # the failure through the result object.
        logger.exception("Format sample failed")
        return FormatSampleResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )
acestep/llm_inference.py CHANGED
@@ -19,7 +19,7 @@ from transformers.generation.logits_process import (
19
  RepetitionPenaltyLogitsProcessor,
20
  )
21
  from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
22
- from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION
23
 
24
 
25
  class LLMHandler:
@@ -1296,8 +1296,6 @@ class LLMHandler:
1296
  self,
1297
  audio_codes: str,
1298
  temperature: float = 0.3,
1299
- cfg_scale: float = 1.0,
1300
- negative_prompt: str = "NO USER INPUT",
1301
  top_k: Optional[int] = None,
1302
  top_p: Optional[float] = None,
1303
  repetition_penalty: float = 1.0,
@@ -1306,16 +1304,16 @@ class LLMHandler:
1306
  ) -> Tuple[Dict[str, Any], str]:
1307
  """
1308
  Understand audio codes and generate metadata + lyrics.
1309
-
1310
  This is the reverse of the normal generation flow:
1311
  - Input: Audio codes
1312
  - Output: Metadata (bpm, caption, duration, etc.) + Lyrics
1313
-
 
 
1314
  Args:
1315
  audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
1316
  temperature: Sampling temperature for generation
1317
- cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
1318
- negative_prompt: Negative prompt for CFG
1319
  top_k: Top-K sampling (None = disabled)
1320
  top_p: Top-P (nucleus) sampling (None = disabled)
1321
  repetition_penalty: Repetition penalty (1.0 = no penalty)
@@ -1352,12 +1350,11 @@ class LLMHandler:
1352
  print(f"formatted_prompt: {formatted_prompt}")
1353
  # Generate using constrained decoding (understand phase)
1354
  # We want to generate metadata first (CoT), then lyrics (natural text)
 
1355
  output_text, status = self.generate_from_formatted_prompt(
1356
  formatted_prompt=formatted_prompt,
1357
  cfg={
1358
  "temperature": temperature,
1359
- "cfg_scale": cfg_scale,
1360
- "negative_prompt": negative_prompt,
1361
  "top_k": top_k,
1362
  "top_p": top_p,
1363
  "repetition_penalty": repetition_penalty,
@@ -1491,7 +1488,7 @@ class LLMHandler:
1491
  self,
1492
  query: str,
1493
  instrumental: bool = False,
1494
- vocal_language: Optional[List[str]] = None,
1495
  temperature: float = 0.85,
1496
  top_k: Optional[int] = None,
1497
  top_p: Optional[float] = None,
@@ -1509,8 +1506,8 @@ class LLMHandler:
1509
  Args:
1510
  query: User's natural language music description
1511
  instrumental: Whether to generate instrumental music (no vocals)
1512
- vocal_language: List of allowed vocal languages for constrained decoding (e.g., ["en", "zh"]).
1513
- If provided and not ["unknown"], the first language will be used.
1514
  temperature: Sampling temperature for generation (0.0-2.0)
1515
  top_k: Top-K sampling (None = disabled)
1516
  top_p: Top-P (nucleus) sampling (None = disabled)
@@ -1532,7 +1529,7 @@ class LLMHandler:
1532
 
1533
  Example:
1534
  query = "a soft Bengali love song for a quiet evening"
1535
- metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language=["bn"])
1536
  print(metadata['caption']) # "A gentle romantic acoustic pop ballad..."
1537
  print(metadata['lyrics']) # "[Intro: ...]\\n..."
1538
  """
@@ -1540,7 +1537,7 @@ class LLMHandler:
1540
  return {}, "❌ 5Hz LM not initialized. Please initialize it first."
1541
 
1542
  if not query or not query.strip():
1543
- return {}, " No query provided. Please enter a music description."
1544
 
1545
  logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
1546
 
@@ -1554,14 +1551,11 @@ class LLMHandler:
1554
  # Build user_metadata if vocal_language is specified and is not "unknown"
1555
  user_metadata = None
1556
  skip_language = False
1557
- if vocal_language and len(vocal_language) > 0:
1558
- # Filter out "unknown" from the list
1559
- valid_languages = [lang for lang in vocal_language if lang and lang.lower() != "unknown"]
1560
- if valid_languages:
1561
- # Use the first valid language for constrained decoding
1562
- user_metadata = {"language": valid_languages[0]}
1563
- skip_language = True # Skip language generation since we're injecting it
1564
- logger.info(f"Using user-specified language: {valid_languages[0]}")
1565
 
1566
  # Generate using constrained decoding (inspiration phase)
1567
  # Similar to understand mode - generate metadata first (CoT), then lyrics
@@ -1612,6 +1606,204 @@ class LLMHandler:
1612
  status_msg = f"✅ Sample created successfully\nGenerated fields: {', '.join(metadata.keys())}"
1613
  return metadata, status_msg
1614
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1615
  def generate_from_formatted_prompt(
1616
  self,
1617
  formatted_prompt: str,
 
19
  RepetitionPenaltyLogitsProcessor,
20
  )
21
  from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
22
+ from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION, DEFAULT_LM_REWRITE_INSTRUCTION
23
 
24
 
25
  class LLMHandler:
 
1296
  self,
1297
  audio_codes: str,
1298
  temperature: float = 0.3,
 
 
1299
  top_k: Optional[int] = None,
1300
  top_p: Optional[float] = None,
1301
  repetition_penalty: float = 1.0,
 
1304
  ) -> Tuple[Dict[str, Any], str]:
1305
  """
1306
  Understand audio codes and generate metadata + lyrics.
1307
+
1308
  This is the reverse of the normal generation flow:
1309
  - Input: Audio codes
1310
  - Output: Metadata (bpm, caption, duration, etc.) + Lyrics
1311
+
1312
+ Note: cfg_scale and negative_prompt are not supported in understand mode.
1313
+
1314
  Args:
1315
  audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
1316
  temperature: Sampling temperature for generation
 
 
1317
  top_k: Top-K sampling (None = disabled)
1318
  top_p: Top-P (nucleus) sampling (None = disabled)
1319
  repetition_penalty: Repetition penalty (1.0 = no penalty)
 
1350
  print(f"formatted_prompt: {formatted_prompt}")
1351
  # Generate using constrained decoding (understand phase)
1352
  # We want to generate metadata first (CoT), then lyrics (natural text)
1353
+ # Note: cfg_scale and negative_prompt are not used in understand mode
1354
  output_text, status = self.generate_from_formatted_prompt(
1355
  formatted_prompt=formatted_prompt,
1356
  cfg={
1357
  "temperature": temperature,
 
 
1358
  "top_k": top_k,
1359
  "top_p": top_p,
1360
  "repetition_penalty": repetition_penalty,
 
1488
  self,
1489
  query: str,
1490
  instrumental: bool = False,
1491
+ vocal_language: Optional[str] = None,
1492
  temperature: float = 0.85,
1493
  top_k: Optional[int] = None,
1494
  top_p: Optional[float] = None,
 
1506
  Args:
1507
  query: User's natural language music description
1508
  instrumental: Whether to generate instrumental music (no vocals)
1509
+ vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
1510
+ If provided and not "unknown", it will be used.
1511
  temperature: Sampling temperature for generation (0.0-2.0)
1512
  top_k: Top-K sampling (None = disabled)
1513
  top_p: Top-P (nucleus) sampling (None = disabled)
 
1529
 
1530
  Example:
1531
  query = "a soft Bengali love song for a quiet evening"
1532
+ metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language="bn")
1533
  print(metadata['caption']) # "A gentle romantic acoustic pop ballad..."
1534
  print(metadata['lyrics']) # "[Intro: ...]\\n..."
1535
  """
 
1537
  return {}, "❌ 5Hz LM not initialized. Please initialize it first."
1538
 
1539
  if not query or not query.strip():
1540
+ query = "NO USER INPUT"
1541
 
1542
  logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
1543
 
 
1551
  # Build user_metadata if vocal_language is specified and is not "unknown"
1552
  user_metadata = None
1553
  skip_language = False
1554
+ if vocal_language and vocal_language.strip() and vocal_language.strip().lower() != "unknown":
1555
+ # Use the specified language for constrained decoding
1556
+ user_metadata = {"language": vocal_language.strip()}
1557
+ skip_language = True # Skip language generation since we're injecting it
1558
+ logger.info(f"Using user-specified language: {vocal_language.strip()}")
 
 
 
1559
 
1560
  # Generate using constrained decoding (inspiration phase)
1561
  # Similar to understand mode - generate metadata first (CoT), then lyrics
 
1606
  status_msg = f"✅ Sample created successfully\nGenerated fields: {', '.join(metadata.keys())}"
1607
  return metadata, status_msg
1608
 
1609
def build_formatted_prompt_for_format(
    self,
    caption: str,
    lyrics: str,
    is_negative_prompt: bool = False,
    negative_prompt: str = "NO USER INPUT"
) -> str:
    """
    Build the chat-formatted prompt used by format/rewrite mode.

    The system turn carries the rewrite instruction; the user turn carries
    either the caption and lyrics or, for the negative variant, the negative
    prompt text.

    Args:
        caption: User's caption/description of the music
        lyrics: User's lyrics
        is_negative_prompt: If True, the user turn holds ``negative_prompt``
            instead of the caption and lyrics (intended for the CFG
            unconditional prompt)
        negative_prompt: Text used when is_negative_prompt=True; a blank or
            whitespace-only value yields an empty user turn

    Returns:
        The prompt string produced by the tokenizer's chat template, with the
        generation prompt appended

    Raises:
        ValueError: If the LLM tokenizer has not been initialized

    Example:
        caption = "Latin pop, reggaeton, flamenco-pop"
        lyrics = "[Verse 1]\\nTengo un nudo..."
        prompt = handler.build_formatted_prompt_for_format(caption, lyrics)
    """
    tokenizer = self.llm_tokenizer
    if tokenizer is None:
        raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")

    if not is_negative_prompt:
        # Regular prompt: caption section followed by lyric section.
        user_content = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}"
    elif negative_prompt and negative_prompt.strip():
        user_content = negative_prompt
    else:
        user_content = ""

    system_turn = {
        "role": "system",
        "content": f"# Instruction\n{DEFAULT_LM_REWRITE_INSTRUCTION}\n\n",
    }
    user_turn = {"role": "user", "content": user_content}

    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
1660
+
1661
def format_sample_from_input(
    self,
    caption: str,
    lyrics: str,
    user_metadata: Optional[Dict[str, Any]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> Tuple[Dict[str, Any], str]:
    """
    Format user-provided caption and lyrics into structured music metadata.

    This is the "Format" feature that takes user input and generates:
    - Enhanced caption with detailed music description
    - Metadata (bpm, duration, keyscale, language, timesignature)
    - Formatted lyrics (falls back to the input lyrics if none are generated)

    Note: cfg_scale and negative_prompt are not supported in format mode.

    Args:
        caption: User's caption/description (e.g., "Latin pop, reggaeton").
            Blank input is replaced by "NO USER INPUT".
        lyrics: User's lyrics with structure tags. Blank input is replaced by
            "[Instrumental]".
        user_metadata: Optional dict with user-provided metadata to constrain decoding.
            Supported keys: bpm, duration, keyscale, timesignature, language.
            bpm/duration are used only when they parse to a positive integer;
            blank strings are ignored, and a language of "unknown" is treated
            as "no constraint" (same convention as create_sample_from_query).
        temperature: Sampling temperature for generation (0.0-2.0)
        top_k: Top-K sampling (None = disabled)
        top_p: Top-P (nucleus) sampling (None = disabled)
        repetition_penalty: Repetition penalty (1.0 = no penalty)
        use_constrained_decoding: Whether to use FSM-based constrained decoding
        constrained_decoding_debug: Whether to enable debug logging

    Returns:
        Tuple of (metadata_dict, status_message)
        metadata_dict is empty on failure; otherwise it holds whatever fields
        parse_lm_output extracted (expected: bpm, caption, duration, keyscale,
        language, timesignature) plus:
            - lyrics: str (generated lyrics, or the input lyrics as fallback)

    Example:
        caption = "Latin pop, reggaeton, flamenco-pop"
        lyrics = "[Verse 1]\\nTengo un nudo en la garganta..."
        metadata, status = handler.format_sample_from_input(caption, lyrics)
        print(metadata['caption'])  # "A dramatic and powerful Latin pop track..."
        print(metadata['bpm'])  # 100
    """
    if not getattr(self, "llm_initialized", False):
        return {}, "❌ 5Hz LM not initialized. Please initialize it first."

    if not caption or not caption.strip():
        caption = "NO USER INPUT"
    if not lyrics or not lyrics.strip():
        lyrics = "[Instrumental]"

    logger.info(f"Formatting sample from input: caption={caption[:50]}..., lyrics length={len(lyrics)}")

    # Build formatted prompt for format task
    formatted_prompt = self.build_formatted_prompt_for_format(
        caption=caption,
        lyrics=lyrics,
    )
    logger.debug(f"Formatted prompt for format: {formatted_prompt}")

    def _positive_int(raw: Any) -> Optional[int]:
        """Coerce *raw* to a positive int, or None when that is not possible."""
        try:
            # Go through float so values such as 120.0 or "120.0" are accepted.
            number = int(float(raw))
        except (ValueError, TypeError, OverflowError):
            # ValueError: non-numeric text or NaN; OverflowError: infinity.
            return None
        return number if number > 0 else None

    # Build constrained decoding metadata from user_metadata
    constrained_metadata = None
    if user_metadata:
        constraints = {}

        for numeric_key in ("bpm", "duration"):
            parsed = _positive_int(user_metadata.get(numeric_key))
            if parsed is not None:
                constraints[numeric_key] = parsed

        for text_key in ("keyscale", "timesignature", "language"):
            raw = user_metadata.get(text_key)
            if isinstance(raw, str):
                raw = raw.strip()
            if not raw:
                continue
            # "unknown" means "let the model decide", as in create_sample_from_query.
            if text_key == "language" and str(raw).lower() == "unknown":
                continue
            constraints[text_key] = raw

        # Only use if we have at least one field
        if constraints:
            constrained_metadata = constraints
            logger.info(f"Using user-provided metadata constraints: {constrained_metadata}")

    # The language field is injected (not generated) when the user supplied one.
    skip_language = bool(constrained_metadata and "language" in constrained_metadata)

    # Generate using constrained decoding (format phase)
    # Similar to understand/inspiration mode - generate metadata first (CoT), then formatted lyrics
    # Note: cfg_scale and negative_prompt are not used in format mode
    output_text, status = self.generate_from_formatted_prompt(
        formatted_prompt=formatted_prompt,
        cfg={
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "target_duration": None,  # No duration constraint for generation length
            "user_metadata": constrained_metadata,  # Inject user-provided metadata
            "skip_caption": False,  # Generate caption
            "skip_language": skip_language,
            "skip_genres": False,  # Generate genres
            "generation_phase": "understand",  # Use understand phase for metadata + free-form lyrics
            "caption": "",
            "lyrics": "",
        },
        use_constrained_decoding=use_constrained_decoding,
        constrained_decoding_debug=constrained_decoding_debug,
        stop_at_reasoning=False,  # Continue after </think> to get formatted lyrics
    )

    if not output_text:
        return {}, status

    # Parse metadata and extract lyrics
    metadata, _ = self.parse_lm_output(output_text)

    # Extract formatted lyrics section (everything after </think>);
    # if no lyrics were generated, keep the (possibly defaulted) input lyrics.
    formatted_lyrics = self._extract_lyrics_from_output(output_text)
    metadata['lyrics'] = formatted_lyrics if formatted_lyrics else lyrics

    logger.info(f"Format completed successfully. Generated {len(metadata)} fields")
    if constrained_decoding_debug:
        logger.debug(f"Generated metadata: {list(metadata.keys())}")
        logger.debug(f"Output text preview: {output_text[:300]}...")

    status_msg = f"✅ Format completed successfully\nGenerated fields: {', '.join(metadata.keys())}"
    return metadata, status_msg
1806
+
1807
  def generate_from_formatted_prompt(
1808
  self,
1809
  formatted_prompt: str,
examples/simple_mode/example_01.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "a soft Bengali love song for a quiet evening",
3
  "instrumental": false,
4
- "vocal_language": ["bn"]
5
  }
 
1
  {
2
  "description": "a soft Bengali love song for a quiet evening",
3
  "instrumental": false,
4
+ "vocal_language": "bn"
5
  }
examples/simple_mode/example_02.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "an upbeat summer pop song with catchy hooks",
3
  "instrumental": false,
4
- "vocal_language": ["en"]
5
  }
 
1
  {
2
  "description": "an upbeat summer pop song with catchy hooks",
3
  "instrumental": false,
4
+ "vocal_language": "en"
5
  }
examples/simple_mode/example_03.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "epic orchestral cinematic music for a movie trailer",
3
  "instrumental": true,
4
- "vocal_language": ["unknown"]
5
  }
 
1
  {
2
  "description": "epic orchestral cinematic music for a movie trailer",
3
  "instrumental": true,
4
+ "vocal_language": "unknown"
5
  }
examples/simple_mode/example_04.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "一首深情的中文抒情歌曲,适合夜晚独自聆听",
3
  "instrumental": false,
4
- "vocal_language": ["zh"]
5
  }
 
1
  {
2
  "description": "一首深情的中文抒情歌曲,适合夜晚独自聆听",
3
  "instrumental": false,
4
+ "vocal_language": "zh"
5
  }
examples/simple_mode/example_05.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "Japanese city pop with nostalgic 80s vibes",
3
  "instrumental": false,
4
- "vocal_language": ["ja"]
5
  }
 
1
  {
2
  "description": "Japanese city pop with nostalgic 80s vibes",
3
  "instrumental": false,
4
+ "vocal_language": "ja"
5
  }
examples/simple_mode/example_06.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "lo-fi hip hop beats for studying and relaxing",
3
  "instrumental": true,
4
- "vocal_language": ["unknown"]
5
  }
 
1
  {
2
  "description": "lo-fi hip hop beats for studying and relaxing",
3
  "instrumental": true,
4
+ "vocal_language": "unknown"
5
  }
examples/simple_mode/example_07.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "energetic K-pop dance track with powerful vocals",
3
  "instrumental": false,
4
- "vocal_language": ["ko"]
5
  }
 
1
  {
2
  "description": "energetic K-pop dance track with powerful vocals",
3
  "instrumental": false,
4
+ "vocal_language": "ko"
5
  }
examples/simple_mode/example_08.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "romantic Spanish guitar ballad with heartfelt lyrics",
3
  "instrumental": false,
4
- "vocal_language": ["es"]
5
  }
 
1
  {
2
  "description": "romantic Spanish guitar ballad with heartfelt lyrics",
3
  "instrumental": false,
4
+ "vocal_language": "es"
5
  }
examples/simple_mode/example_09.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "中国风电子舞曲,融合古典乐器与现代节拍",
3
  "instrumental": false,
4
- "vocal_language": ["zh"]
5
  }
 
1
  {
2
  "description": "中国风电子舞曲,融合古典乐器与现代节拍",
3
  "instrumental": false,
4
+ "vocal_language": "zh"
5
  }
examples/simple_mode/example_10.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "description": "peaceful piano melody for meditation and relaxation",
3
  "instrumental": true,
4
- "vocal_language": ["unknown"]
5
  }
 
1
  {
2
  "description": "peaceful piano melody for meditation and relaxation",
3
  "instrumental": true,
4
+ "vocal_language": "unknown"
5
  }