Spaces: Running on A100
test rewrite format
Browse files
- .gitignore +2 -1
- acestep/gradio_ui/events/__init__.py +32 -0
- acestep/gradio_ui/events/generation_handlers.py +143 -40
- acestep/gradio_ui/events/results_handlers.py +13 -3
- acestep/gradio_ui/i18n/en.json +6 -2
- acestep/gradio_ui/i18n/ja.json +6 -2
- acestep/gradio_ui/i18n/zh.json +6 -2
- acestep/gradio_ui/interfaces/generation.py +37 -18
- acestep/inference.py +179 -19
- acestep/llm_inference.py +214 -22
- examples/simple_mode/example_01.json +1 -1
- examples/simple_mode/example_02.json +1 -1
- examples/simple_mode/example_03.json +1 -1
- examples/simple_mode/example_04.json +1 -1
- examples/simple_mode/example_05.json +1 -1
- examples/simple_mode/example_06.json +1 -1
- examples/simple_mode/example_07.json +1 -1
- examples/simple_mode/example_08.json +1 -1
- examples/simple_mode/example_09.json +1 -1
- examples/simple_mode/example_10.json +1 -1
.gitignore
CHANGED
|
@@ -220,4 +220,5 @@ discord_bot/
|
|
| 220 |
feishu_bot/
|
| 221 |
tmp*
|
| 222 |
torchinductor_root/
|
| 223 |
-
scripts/
|
|
|
|
|
|
| 220 |
feishu_bot/
|
| 221 |
tmp*
|
| 222 |
torchinductor_root/
|
| 223 |
+
scripts/
|
| 224 |
+
checkpoints_legacy/
|
acestep/gradio_ui/events/__init__.py
CHANGED
|
@@ -190,6 +190,37 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 190 |
outputs=[generation_section["lyrics"]]
|
| 191 |
)
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
# ========== Simple/Custom Mode Toggle ==========
|
| 194 |
generation_section["generation_mode"].change(
|
| 195 |
fn=gen_h.handle_generation_mode_change,
|
|
@@ -245,6 +276,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 245 |
generation_section["audio_duration"],
|
| 246 |
generation_section["key_scale"],
|
| 247 |
generation_section["vocal_language"],
|
|
|
|
| 248 |
generation_section["time_signature"],
|
| 249 |
generation_section["instrumental_checkbox"],
|
| 250 |
generation_section["caption_accordion"],
|
|
|
|
| 190 |
outputs=[generation_section["lyrics"]]
|
| 191 |
)
|
| 192 |
|
| 193 |
+
# ========== Format Button ==========
|
| 194 |
+
# Note: cfg_scale and negative_prompt are not supported in format mode
|
| 195 |
+
generation_section["format_btn"].click(
|
| 196 |
+
fn=lambda caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug: gen_h.handle_format_sample(
|
| 197 |
+
llm_handler, caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug
|
| 198 |
+
),
|
| 199 |
+
inputs=[
|
| 200 |
+
generation_section["captions"],
|
| 201 |
+
generation_section["lyrics"],
|
| 202 |
+
generation_section["bpm"],
|
| 203 |
+
generation_section["audio_duration"],
|
| 204 |
+
generation_section["key_scale"],
|
| 205 |
+
generation_section["time_signature"],
|
| 206 |
+
generation_section["lm_temperature"],
|
| 207 |
+
generation_section["lm_top_k"],
|
| 208 |
+
generation_section["lm_top_p"],
|
| 209 |
+
generation_section["constrained_decoding_debug"],
|
| 210 |
+
],
|
| 211 |
+
outputs=[
|
| 212 |
+
generation_section["captions"],
|
| 213 |
+
generation_section["lyrics"],
|
| 214 |
+
generation_section["bpm"],
|
| 215 |
+
generation_section["audio_duration"],
|
| 216 |
+
generation_section["key_scale"],
|
| 217 |
+
generation_section["vocal_language"],
|
| 218 |
+
generation_section["time_signature"],
|
| 219 |
+
results_section["is_format_caption_state"],
|
| 220 |
+
results_section["status_output"],
|
| 221 |
+
]
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
# ========== Simple/Custom Mode Toggle ==========
|
| 225 |
generation_section["generation_mode"].change(
|
| 226 |
fn=gen_h.handle_generation_mode_change,
|
|
|
|
| 276 |
generation_section["audio_duration"],
|
| 277 |
generation_section["key_scale"],
|
| 278 |
generation_section["vocal_language"],
|
| 279 |
+
generation_section["simple_vocal_language"],
|
| 280 |
generation_section["time_signature"],
|
| 281 |
generation_section["instrumental_checkbox"],
|
| 282 |
generation_section["caption_accordion"],
|
acestep/gradio_ui/events/generation_handlers.py
CHANGED
|
@@ -13,7 +13,7 @@ from acestep.constants import (
|
|
| 13 |
TASK_TYPES_BASE,
|
| 14 |
)
|
| 15 |
from acestep.gradio_ui.i18n import t
|
| 16 |
-
from acestep.inference import understand_music, create_sample
|
| 17 |
|
| 18 |
|
| 19 |
def load_metadata(file_obj):
|
|
@@ -256,7 +256,7 @@ def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug
|
|
| 256 |
|
| 257 |
def load_random_simple_description():
|
| 258 |
"""Load a random description from the simple_mode examples directory.
|
| 259 |
-
|
| 260 |
Returns:
|
| 261 |
Tuple of (description, instrumental, vocal_language) for updating UI components
|
| 262 |
"""
|
|
@@ -265,39 +265,39 @@ def load_random_simple_description():
|
|
| 265 |
current_file = os.path.abspath(__file__)
|
| 266 |
# This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
|
| 267 |
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
|
| 268 |
-
|
| 269 |
# Construct the examples directory path
|
| 270 |
examples_dir = os.path.join(project_root, "examples", "simple_mode")
|
| 271 |
-
|
| 272 |
# Check if directory exists
|
| 273 |
if not os.path.exists(examples_dir):
|
| 274 |
gr.Warning(t("messages.simple_examples_not_found"))
|
| 275 |
return gr.update(), gr.update(), gr.update()
|
| 276 |
-
|
| 277 |
# Find all JSON files in the directory
|
| 278 |
json_files = glob.glob(os.path.join(examples_dir, "*.json"))
|
| 279 |
-
|
| 280 |
if not json_files:
|
| 281 |
gr.Warning(t("messages.simple_examples_empty"))
|
| 282 |
return gr.update(), gr.update(), gr.update()
|
| 283 |
-
|
| 284 |
# Randomly select one file
|
| 285 |
selected_file = random.choice(json_files)
|
| 286 |
-
|
| 287 |
# Read and parse JSON
|
| 288 |
try:
|
| 289 |
with open(selected_file, 'r', encoding='utf-8') as f:
|
| 290 |
data = json.load(f)
|
| 291 |
-
|
| 292 |
# Extract fields
|
| 293 |
description = data.get('description', '')
|
| 294 |
instrumental = data.get('instrumental', False)
|
| 295 |
-
vocal_language = data.get('vocal_language',
|
| 296 |
-
|
| 297 |
-
# Ensure vocal_language is a
|
| 298 |
-
if isinstance(vocal_language,
|
| 299 |
-
vocal_language = [vocal_language
|
| 300 |
-
|
| 301 |
gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
|
| 302 |
return description, instrumental, vocal_language
|
| 303 |
|
|
@@ -564,7 +564,7 @@ def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
|
|
| 564 |
def handle_simple_instrumental_change(is_instrumental: bool):
|
| 565 |
"""
|
| 566 |
Handle simple mode instrumental checkbox changes.
|
| 567 |
-
When checked: set vocal_language to
|
| 568 |
When unchecked: enable vocal_language editing.
|
| 569 |
|
| 570 |
Args:
|
|
@@ -574,7 +574,7 @@ def handle_simple_instrumental_change(is_instrumental: bool):
|
|
| 574 |
gr.update for simple_vocal_language dropdown
|
| 575 |
"""
|
| 576 |
if is_instrumental:
|
| 577 |
-
return gr.update(value=
|
| 578 |
else:
|
| 579 |
return gr.update(interactive=True)
|
| 580 |
|
|
@@ -653,7 +653,7 @@ def handle_create_sample(
|
|
| 653 |
llm_handler,
|
| 654 |
query: str,
|
| 655 |
instrumental: bool,
|
| 656 |
-
vocal_language:
|
| 657 |
lm_temperature: float,
|
| 658 |
lm_top_k: int,
|
| 659 |
lm_top_p: float,
|
|
@@ -671,7 +671,7 @@ def handle_create_sample(
|
|
| 671 |
llm_handler: LLM handler instance
|
| 672 |
query: User's natural language music description
|
| 673 |
instrumental: Whether to generate instrumental music
|
| 674 |
-
vocal_language:
|
| 675 |
lm_temperature: LLM temperature for generation
|
| 676 |
lm_top_k: LLM top-k sampling
|
| 677 |
lm_top_p: LLM top-p sampling
|
|
@@ -695,27 +695,6 @@ def handle_create_sample(
|
|
| 695 |
- is_format_caption_state (True)
|
| 696 |
- status_output
|
| 697 |
"""
|
| 698 |
-
# Validate query
|
| 699 |
-
if not query or not query.strip():
|
| 700 |
-
gr.Warning(t("messages.empty_query"))
|
| 701 |
-
return (
|
| 702 |
-
gr.update(), # captions - no change
|
| 703 |
-
gr.update(), # lyrics - no change
|
| 704 |
-
gr.update(), # bpm - no change
|
| 705 |
-
gr.update(), # audio_duration - no change
|
| 706 |
-
gr.update(), # key_scale - no change
|
| 707 |
-
gr.update(), # vocal_language - no change
|
| 708 |
-
gr.update(), # time_signature - no change
|
| 709 |
-
gr.update(), # instrumental_checkbox - no change
|
| 710 |
-
gr.update(), # caption_accordion - no change
|
| 711 |
-
gr.update(), # lyrics_accordion - no change
|
| 712 |
-
gr.update(interactive=False), # generate_btn - keep disabled
|
| 713 |
-
False, # simple_sample_created - still False
|
| 714 |
-
gr.update(), # think_checkbox - no change
|
| 715 |
-
gr.update(), # is_format_caption_state - no change
|
| 716 |
-
t("messages.empty_query"), # status_output
|
| 717 |
-
)
|
| 718 |
-
|
| 719 |
# Check if LLM is initialized
|
| 720 |
if not llm_handler.llm_initialized:
|
| 721 |
gr.Warning(t("messages.lm_not_initialized"))
|
|
@@ -765,6 +744,7 @@ def handle_create_sample(
|
|
| 765 |
gr.update(), # audio_duration - no change
|
| 766 |
gr.update(), # key_scale - no change
|
| 767 |
gr.update(), # vocal_language - no change
|
|
|
|
| 768 |
gr.update(), # time_signature - no change
|
| 769 |
gr.update(), # instrumental_checkbox - no change
|
| 770 |
gr.update(), # caption_accordion - no change
|
|
@@ -786,6 +766,7 @@ def handle_create_sample(
|
|
| 786 |
result.duration if result.duration and result.duration > 0 else -1, # audio_duration
|
| 787 |
result.keyscale, # key_scale
|
| 788 |
result.language, # vocal_language
|
|
|
|
| 789 |
result.timesignature, # time_signature
|
| 790 |
result.instrumental, # instrumental_checkbox
|
| 791 |
gr.update(open=True), # caption_accordion - expand
|
|
@@ -798,3 +779,125 @@ def handle_create_sample(
|
|
| 798 |
)
|
| 799 |
|
| 800 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
TASK_TYPES_BASE,
|
| 14 |
)
|
| 15 |
from acestep.gradio_ui.i18n import t
|
| 16 |
+
from acestep.inference import understand_music, create_sample, format_sample
|
| 17 |
|
| 18 |
|
| 19 |
def load_metadata(file_obj):
|
|
|
|
| 256 |
|
| 257 |
def load_random_simple_description():
|
| 258 |
"""Load a random description from the simple_mode examples directory.
|
| 259 |
+
|
| 260 |
Returns:
|
| 261 |
Tuple of (description, instrumental, vocal_language) for updating UI components
|
| 262 |
"""
|
|
|
|
| 265 |
current_file = os.path.abspath(__file__)
|
| 266 |
# This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
|
| 267 |
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
|
| 268 |
+
|
| 269 |
# Construct the examples directory path
|
| 270 |
examples_dir = os.path.join(project_root, "examples", "simple_mode")
|
| 271 |
+
|
| 272 |
# Check if directory exists
|
| 273 |
if not os.path.exists(examples_dir):
|
| 274 |
gr.Warning(t("messages.simple_examples_not_found"))
|
| 275 |
return gr.update(), gr.update(), gr.update()
|
| 276 |
+
|
| 277 |
# Find all JSON files in the directory
|
| 278 |
json_files = glob.glob(os.path.join(examples_dir, "*.json"))
|
| 279 |
+
|
| 280 |
if not json_files:
|
| 281 |
gr.Warning(t("messages.simple_examples_empty"))
|
| 282 |
return gr.update(), gr.update(), gr.update()
|
| 283 |
+
|
| 284 |
# Randomly select one file
|
| 285 |
selected_file = random.choice(json_files)
|
| 286 |
+
|
| 287 |
# Read and parse JSON
|
| 288 |
try:
|
| 289 |
with open(selected_file, 'r', encoding='utf-8') as f:
|
| 290 |
data = json.load(f)
|
| 291 |
+
|
| 292 |
# Extract fields
|
| 293 |
description = data.get('description', '')
|
| 294 |
instrumental = data.get('instrumental', False)
|
| 295 |
+
vocal_language = data.get('vocal_language', 'unknown')
|
| 296 |
+
|
| 297 |
+
# Ensure vocal_language is a string
|
| 298 |
+
if isinstance(vocal_language, list):
|
| 299 |
+
vocal_language = vocal_language[0] if vocal_language else 'unknown'
|
| 300 |
+
|
| 301 |
gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
|
| 302 |
return description, instrumental, vocal_language
|
| 303 |
|
|
|
|
| 564 |
def handle_simple_instrumental_change(is_instrumental: bool):
|
| 565 |
"""
|
| 566 |
Handle simple mode instrumental checkbox changes.
|
| 567 |
+
When checked: set vocal_language to "unknown" and disable editing.
|
| 568 |
When unchecked: enable vocal_language editing.
|
| 569 |
|
| 570 |
Args:
|
|
|
|
| 574 |
gr.update for simple_vocal_language dropdown
|
| 575 |
"""
|
| 576 |
if is_instrumental:
|
| 577 |
+
return gr.update(value="unknown", interactive=False)
|
| 578 |
else:
|
| 579 |
return gr.update(interactive=True)
|
| 580 |
|
|
|
|
| 653 |
llm_handler,
|
| 654 |
query: str,
|
| 655 |
instrumental: bool,
|
| 656 |
+
vocal_language: str,
|
| 657 |
lm_temperature: float,
|
| 658 |
lm_top_k: int,
|
| 659 |
lm_top_p: float,
|
|
|
|
| 671 |
llm_handler: LLM handler instance
|
| 672 |
query: User's natural language music description
|
| 673 |
instrumental: Whether to generate instrumental music
|
| 674 |
+
vocal_language: Preferred vocal language for constrained decoding
|
| 675 |
lm_temperature: LLM temperature for generation
|
| 676 |
lm_top_k: LLM top-k sampling
|
| 677 |
lm_top_p: LLM top-p sampling
|
|
|
|
| 695 |
- is_format_caption_state (True)
|
| 696 |
- status_output
|
| 697 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 698 |
# Check if LLM is initialized
|
| 699 |
if not llm_handler.llm_initialized:
|
| 700 |
gr.Warning(t("messages.lm_not_initialized"))
|
|
|
|
| 744 |
gr.update(), # audio_duration - no change
|
| 745 |
gr.update(), # key_scale - no change
|
| 746 |
gr.update(), # vocal_language - no change
|
| 747 |
+
gr.update(), # simple vocal_language - no change
|
| 748 |
gr.update(), # time_signature - no change
|
| 749 |
gr.update(), # instrumental_checkbox - no change
|
| 750 |
gr.update(), # caption_accordion - no change
|
|
|
|
| 766 |
result.duration if result.duration and result.duration > 0 else -1, # audio_duration
|
| 767 |
result.keyscale, # key_scale
|
| 768 |
result.language, # vocal_language
|
| 769 |
+
result.language, # simple vocal_language
|
| 770 |
result.timesignature, # time_signature
|
| 771 |
result.instrumental, # instrumental_checkbox
|
| 772 |
gr.update(open=True), # caption_accordion - expand
|
|
|
|
| 779 |
)
|
| 780 |
|
| 781 |
|
| 782 |
+
def handle_format_sample(
|
| 783 |
+
llm_handler,
|
| 784 |
+
caption: str,
|
| 785 |
+
lyrics: str,
|
| 786 |
+
bpm,
|
| 787 |
+
audio_duration,
|
| 788 |
+
key_scale: str,
|
| 789 |
+
time_signature: str,
|
| 790 |
+
lm_temperature: float,
|
| 791 |
+
lm_top_k: int,
|
| 792 |
+
lm_top_p: float,
|
| 793 |
+
constrained_decoding_debug: bool = False,
|
| 794 |
+
):
|
| 795 |
+
"""
|
| 796 |
+
Handle the Format button click to format caption and lyrics.
|
| 797 |
+
|
| 798 |
+
Takes user-provided caption and lyrics, and uses the LLM to generate
|
| 799 |
+
structured music metadata and an enhanced description.
|
| 800 |
+
|
| 801 |
+
Note: cfg_scale and negative_prompt are not supported in format mode.
|
| 802 |
+
|
| 803 |
+
Args:
|
| 804 |
+
llm_handler: LLM handler instance
|
| 805 |
+
caption: User's caption/description
|
| 806 |
+
lyrics: User's lyrics
|
| 807 |
+
bpm: User-provided BPM (optional, for constrained decoding)
|
| 808 |
+
audio_duration: User-provided duration (optional, for constrained decoding)
|
| 809 |
+
key_scale: User-provided key scale (optional, for constrained decoding)
|
| 810 |
+
time_signature: User-provided time signature (optional, for constrained decoding)
|
| 811 |
+
lm_temperature: LLM temperature for generation
|
| 812 |
+
lm_top_k: LLM top-k sampling
|
| 813 |
+
lm_top_p: LLM top-p sampling
|
| 814 |
+
constrained_decoding_debug: Whether to enable debug logging
|
| 815 |
+
|
| 816 |
+
Returns:
|
| 817 |
+
Tuple of updates for:
|
| 818 |
+
- captions
|
| 819 |
+
- lyrics
|
| 820 |
+
- bpm
|
| 821 |
+
- audio_duration
|
| 822 |
+
- key_scale
|
| 823 |
+
- vocal_language
|
| 824 |
+
- time_signature
|
| 825 |
+
- is_format_caption_state
|
| 826 |
+
- status_output
|
| 827 |
+
"""
|
| 828 |
+
# Check if LLM is initialized
|
| 829 |
+
if not llm_handler.llm_initialized:
|
| 830 |
+
gr.Warning(t("messages.lm_not_initialized"))
|
| 831 |
+
return (
|
| 832 |
+
gr.update(), # captions - no change
|
| 833 |
+
gr.update(), # lyrics - no change
|
| 834 |
+
gr.update(), # bpm - no change
|
| 835 |
+
gr.update(), # audio_duration - no change
|
| 836 |
+
gr.update(), # key_scale - no change
|
| 837 |
+
gr.update(), # vocal_language - no change
|
| 838 |
+
gr.update(), # time_signature - no change
|
| 839 |
+
gr.update(), # is_format_caption_state - no change
|
| 840 |
+
t("messages.lm_not_initialized"), # status_output
|
| 841 |
+
)
|
| 842 |
+
|
| 843 |
+
# Build user_metadata from provided values for constrained decoding
|
| 844 |
+
user_metadata = {}
|
| 845 |
+
if bpm is not None and bpm > 0:
|
| 846 |
+
user_metadata['bpm'] = int(bpm)
|
| 847 |
+
if audio_duration is not None and audio_duration > 0:
|
| 848 |
+
user_metadata['duration'] = int(audio_duration)
|
| 849 |
+
if key_scale and key_scale.strip():
|
| 850 |
+
user_metadata['keyscale'] = key_scale.strip()
|
| 851 |
+
if time_signature and time_signature.strip():
|
| 852 |
+
user_metadata['timesignature'] = time_signature.strip()
|
| 853 |
+
|
| 854 |
+
# Only pass user_metadata if we have at least one field
|
| 855 |
+
user_metadata_to_pass = user_metadata if user_metadata else None
|
| 856 |
+
|
| 857 |
+
# Convert LM parameters
|
| 858 |
+
top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
|
| 859 |
+
top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
|
| 860 |
+
|
| 861 |
+
# Call format_sample API
|
| 862 |
+
result = format_sample(
|
| 863 |
+
llm_handler=llm_handler,
|
| 864 |
+
caption=caption,
|
| 865 |
+
lyrics=lyrics,
|
| 866 |
+
user_metadata=user_metadata_to_pass,
|
| 867 |
+
temperature=lm_temperature,
|
| 868 |
+
top_k=top_k_value,
|
| 869 |
+
top_p=top_p_value,
|
| 870 |
+
use_constrained_decoding=True,
|
| 871 |
+
constrained_decoding_debug=constrained_decoding_debug,
|
| 872 |
+
)
|
| 873 |
+
|
| 874 |
+
# Handle error
|
| 875 |
+
if not result.success:
|
| 876 |
+
gr.Warning(result.status_message or t("messages.format_failed"))
|
| 877 |
+
return (
|
| 878 |
+
gr.update(), # captions - no change
|
| 879 |
+
gr.update(), # lyrics - no change
|
| 880 |
+
gr.update(), # bpm - no change
|
| 881 |
+
gr.update(), # audio_duration - no change
|
| 882 |
+
gr.update(), # key_scale - no change
|
| 883 |
+
gr.update(), # vocal_language - no change
|
| 884 |
+
gr.update(), # time_signature - no change
|
| 885 |
+
gr.update(), # is_format_caption_state - no change
|
| 886 |
+
result.status_message or t("messages.format_failed"), # status_output
|
| 887 |
+
)
|
| 888 |
+
|
| 889 |
+
# Success - populate fields
|
| 890 |
+
gr.Info(t("messages.format_success"))
|
| 891 |
+
|
| 892 |
+
return (
|
| 893 |
+
result.caption, # captions
|
| 894 |
+
result.lyrics, # lyrics
|
| 895 |
+
result.bpm, # bpm
|
| 896 |
+
result.duration if result.duration and result.duration > 0 else -1, # audio_duration
|
| 897 |
+
result.keyscale, # key_scale
|
| 898 |
+
result.language, # vocal_language
|
| 899 |
+
result.timesignature, # time_signature
|
| 900 |
+
True, # is_format_caption_state - True (LM-formatted)
|
| 901 |
+
result.status_message, # status_output
|
| 902 |
+
)
|
| 903 |
+
|
acestep/gradio_ui/events/results_handlers.py
CHANGED
|
@@ -465,6 +465,14 @@ def generate_with_progress(
|
|
| 465 |
):
|
| 466 |
"""Generate audio with progress tracking"""
|
| 467 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
# step 1: prepare inputs
|
| 469 |
# generate_music, GenerationParams, GenerationConfig
|
| 470 |
gen_params = GenerationParams(
|
|
@@ -496,7 +504,7 @@ def generate_with_progress(
|
|
| 496 |
lm_top_k=lm_top_k,
|
| 497 |
lm_top_p=lm_top_p,
|
| 498 |
lm_negative_prompt=lm_negative_prompt,
|
| 499 |
-
use_cot_metas=
|
| 500 |
use_cot_caption=use_cot_caption,
|
| 501 |
use_cot_language=use_cot_language,
|
| 502 |
use_constrained_decoding=True,
|
|
@@ -587,7 +595,7 @@ def generate_with_progress(
|
|
| 587 |
# Clear lrc_display with empty string - this triggers .change() to clear subtitles
|
| 588 |
clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
|
| 589 |
clear_accordions = [gr.skip() for _ in range(8)] # Don't change accordion visibility
|
| 590 |
-
dump_audio = [
|
| 591 |
yield (
|
| 592 |
# Audio outputs - just skip, value will be updated in loop
|
| 593 |
# Subtitles will be cleared via lrc_display.change()
|
|
@@ -1682,6 +1690,8 @@ def generate_next_batch_background(
|
|
| 1682 |
|
| 1683 |
# Call generate_with_progress with the saved parameters
|
| 1684 |
# Note: generate_with_progress is a generator, need to iterate through it
|
|
|
|
|
|
|
| 1685 |
generator = generate_with_progress(
|
| 1686 |
dit_handler,
|
| 1687 |
llm_handler,
|
|
@@ -1719,7 +1729,7 @@ def generate_next_batch_background(
|
|
| 1719 |
use_cot_metas=params.get("use_cot_metas"),
|
| 1720 |
use_cot_caption=params.get("use_cot_caption"),
|
| 1721 |
use_cot_language=params.get("use_cot_language"),
|
| 1722 |
-
is_format_caption=is_format_caption,
|
| 1723 |
constrained_decoding_debug=params.get("constrained_decoding_debug"),
|
| 1724 |
allow_lm_batch=params.get("allow_lm_batch"),
|
| 1725 |
auto_score=params.get("auto_score"),
|
|
|
|
| 465 |
):
|
| 466 |
"""Generate audio with progress tracking"""
|
| 467 |
|
| 468 |
+
# Skip Phase 1 metas COT if sample is already formatted (from LLM/file/random)
|
| 469 |
+
# This avoids redundant LLM calls since metas (bpm, keyscale, etc.) are already generated
|
| 470 |
+
actual_use_cot_metas = use_cot_metas
|
| 471 |
+
if is_format_caption and use_cot_metas:
|
| 472 |
+
actual_use_cot_metas = False
|
| 473 |
+
logger.info("[generate_with_progress] Skipping Phase 1 metas COT: sample is already formatted (is_format_caption=True)")
|
| 474 |
+
gr.Info(t("messages.skipping_metas_cot"))
|
| 475 |
+
|
| 476 |
# step 1: prepare inputs
|
| 477 |
# generate_music, GenerationParams, GenerationConfig
|
| 478 |
gen_params = GenerationParams(
|
|
|
|
| 504 |
lm_top_k=lm_top_k,
|
| 505 |
lm_top_p=lm_top_p,
|
| 506 |
lm_negative_prompt=lm_negative_prompt,
|
| 507 |
+
use_cot_metas=actual_use_cot_metas,
|
| 508 |
use_cot_caption=use_cot_caption,
|
| 509 |
use_cot_language=use_cot_language,
|
| 510 |
use_constrained_decoding=True,
|
|
|
|
| 595 |
# Clear lrc_display with empty string - this triggers .change() to clear subtitles
|
| 596 |
clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
|
| 597 |
clear_accordions = [gr.skip() for _ in range(8)] # Don't change accordion visibility
|
| 598 |
+
dump_audio = [gr.update(value="", subtitles="") for _ in range(8)]
|
| 599 |
yield (
|
| 600 |
# Audio outputs - just skip, value will be updated in loop
|
| 601 |
# Subtitles will be cleared via lrc_display.change()
|
|
|
|
| 1690 |
|
| 1691 |
# Call generate_with_progress with the saved parameters
|
| 1692 |
# Note: generate_with_progress is a generator, need to iterate through it
|
| 1693 |
+
# For AutoGen background batches, always skip metas COT since we want to
|
| 1694 |
+
# generate NEW audio codes with new seeds, not regenerate the same metas
|
| 1695 |
generator = generate_with_progress(
|
| 1696 |
dit_handler,
|
| 1697 |
llm_handler,
|
|
|
|
| 1729 |
use_cot_metas=params.get("use_cot_metas"),
|
| 1730 |
use_cot_caption=params.get("use_cot_caption"),
|
| 1731 |
use_cot_language=params.get("use_cot_language"),
|
| 1732 |
+
is_format_caption=is_format_caption, # Pass through - will skip metas COT if True
|
| 1733 |
constrained_decoding_debug=params.get("constrained_decoding_debug"),
|
| 1734 |
allow_lm_batch=params.get("allow_lm_batch"),
|
| 1735 |
auto_score=params.get("auto_score"),
|
acestep/gradio_ui/i18n/en.json
CHANGED
|
@@ -84,7 +84,7 @@
|
|
| 84 |
"mode_simple": "Simple",
|
| 85 |
"mode_custom": "Custom",
|
| 86 |
"simple_query_label": "Song Description",
|
| 87 |
-
"simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'",
|
| 88 |
"simple_query_info": "Enter a natural language description of the music you want to generate",
|
| 89 |
"simple_vocal_language_label": "Vocal Language (optional)",
|
| 90 |
"simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
|
|
@@ -98,6 +98,7 @@
|
|
| 98 |
"lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
|
| 99 |
"lyrics_info": "Song lyrics with structure",
|
| 100 |
"instrumental_label": "Instrumental",
|
|
|
|
| 101 |
"optional_params": "⚙️ Optional Parameters",
|
| 102 |
"vocal_language_label": "Vocal Language (optional)",
|
| 103 |
"vocal_language_info": "use `unknown` for inst",
|
|
@@ -227,6 +228,9 @@
|
|
| 227 |
"sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
|
| 228 |
"simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
|
| 229 |
"simple_examples_empty": "⚠️ No example files found in simple mode examples.",
|
| 230 |
-
"simple_example_loaded": "🎲 Loaded random example from {filename}"
|
|
|
|
|
|
|
|
|
|
| 231 |
}
|
| 232 |
}
|
|
|
|
| 84 |
"mode_simple": "Simple",
|
| 85 |
"mode_custom": "Custom",
|
| 86 |
"simple_query_label": "Song Description",
|
| 87 |
+
"simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'. Leave empty for a random sample.",
|
| 88 |
"simple_query_info": "Enter a natural language description of the music you want to generate",
|
| 89 |
"simple_vocal_language_label": "Vocal Language (optional)",
|
| 90 |
"simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
|
|
|
|
| 98 |
"lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
|
| 99 |
"lyrics_info": "Song lyrics with structure",
|
| 100 |
"instrumental_label": "Instrumental",
|
| 101 |
+
"format_btn": "Format",
|
| 102 |
"optional_params": "⚙️ Optional Parameters",
|
| 103 |
"vocal_language_label": "Vocal Language (optional)",
|
| 104 |
"vocal_language_info": "use `unknown` for inst",
|
|
|
|
| 228 |
"sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
|
| 229 |
"simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
|
| 230 |
"simple_examples_empty": "⚠️ No example files found in simple mode examples.",
|
| 231 |
+
"simple_example_loaded": "🎲 Loaded random example from {filename}",
|
| 232 |
+
"format_success": "✅ Caption and lyrics formatted successfully",
|
| 233 |
+
"format_failed": "❌ Format failed: {error}",
|
| 234 |
+
"skipping_metas_cot": "⚡ Skipping Phase 1 metas COT (sample already formatted)"
|
| 235 |
}
|
| 236 |
}
|
acestep/gradio_ui/i18n/ja.json
CHANGED
|
@@ -84,7 +84,7 @@
|
|
| 84 |
"mode_simple": "シンプル",
|
| 85 |
"mode_custom": "カスタム",
|
| 86 |
"simple_query_label": "曲の説明",
|
| 87 |
-
"simple_query_placeholder": "作成したい音楽を説明してください。例:'静かな夜のための優しいベンガルのラブソング'",
|
| 88 |
"simple_query_info": "生成したい音楽の自然言語の説明を入力",
|
| 89 |
"simple_vocal_language_label": "ボーカル言語(オプション)",
|
| 90 |
"simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
|
|
@@ -98,6 +98,7 @@
|
|
| 98 |
"lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
|
| 99 |
"lyrics_info": "構造を持つ曲の歌詞",
|
| 100 |
"instrumental_label": "インストゥルメンタル",
|
|
|
|
| 101 |
"optional_params": "⚙️ オプションパラメータ",
|
| 102 |
"vocal_language_label": "ボーカル言語(オプション)",
|
| 103 |
"vocal_language_info": "インストには`unknown`を使用",
|
|
@@ -227,6 +228,9 @@
|
|
| 227 |
"sample_created": "✅ サンプルが作成されました!キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
|
| 228 |
"simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
|
| 229 |
"simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
|
| 230 |
-
"simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました"
|
|
|
|
|
|
|
|
|
|
| 231 |
}
|
| 232 |
}
|
|
|
|
| 84 |
"mode_simple": "シンプル",
|
| 85 |
"mode_custom": "カスタム",
|
| 86 |
"simple_query_label": "曲の説明",
|
| 87 |
+
"simple_query_placeholder": "作成したい音楽を説明してください。例:'静かな夜のための優しいベンガルのラブソング'。空欄の場合はランダムなサンプルが生成されます。",
|
| 88 |
"simple_query_info": "生成したい音楽の自然言語の説明を入力",
|
| 89 |
"simple_vocal_language_label": "ボーカル言語(オプション)",
|
| 90 |
"simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
|
|
|
|
| 98 |
"lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
|
| 99 |
"lyrics_info": "構造を持つ曲の歌詞",
|
| 100 |
"instrumental_label": "インストゥルメンタル",
|
| 101 |
+
"format_btn": "フォーマット",
|
| 102 |
"optional_params": "⚙️ オプションパラメータ",
|
| 103 |
"vocal_language_label": "ボーカル言語(オプション)",
|
| 104 |
"vocal_language_info": "インストには`unknown`を使用",
|
|
|
|
| 228 |
"sample_created": "✅ サンプルが作成されました!キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
|
| 229 |
"simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
|
| 230 |
"simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
|
| 231 |
+
"simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました",
|
| 232 |
+
"format_success": "✅ キャプションと歌詞のフォーマットに成功しました",
|
| 233 |
+
"format_failed": "❌ フォーマットに失敗しました: {error}",
|
| 234 |
+
"skipping_metas_cot": "⚡ Phase 1 メタデータ COT をスキップ(サンプルは既にフォーマット済み)"
|
| 235 |
}
|
| 236 |
}
|
acestep/gradio_ui/i18n/zh.json
CHANGED
|
@@ -84,7 +84,7 @@
|
|
| 84 |
"mode_simple": "简单",
|
| 85 |
"mode_custom": "自定义",
|
| 86 |
"simple_query_label": "歌曲描述",
|
| 87 |
-
"simple_query_placeholder": "描述你想创作的音乐,例如:'给我生成一首暗黑的戏剧古风,歌词要华丽'",
|
| 88 |
"simple_query_info": "输入你想生成的音乐的自然语言描述",
|
| 89 |
"simple_vocal_language_label": "人声语言(可选)",
|
| 90 |
"simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
|
|
@@ -98,6 +98,7 @@
|
|
| 98 |
"lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
|
| 99 |
"lyrics_info": "带有结构的歌曲歌词",
|
| 100 |
"instrumental_label": "纯音乐",
|
|
|
|
| 101 |
"optional_params": "⚙️ 可选参数",
|
| 102 |
"vocal_language_label": "人声语言(可选)",
|
| 103 |
"vocal_language_info": "纯音乐使用 `unknown`",
|
|
@@ -227,6 +228,9 @@
|
|
| 227 |
"sample_created": "✅ 样本已创建!检查描述和歌词,然后点击生成音乐。",
|
| 228 |
"simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
|
| 229 |
"simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
|
| 230 |
-
"simple_example_loaded": "🎲 已从 {filename} 加载随机示例"
|
|
|
|
|
|
|
|
|
|
| 231 |
}
|
| 232 |
}
|
|
|
|
| 84 |
"mode_simple": "简单",
|
| 85 |
"mode_custom": "自定义",
|
| 86 |
"simple_query_label": "歌曲描述",
|
| 87 |
+
"simple_query_placeholder": "描述你想创作的音乐,例如:'给我生成一首暗黑的戏剧古风,歌词要华丽'。留空则随机生成样本。",
|
| 88 |
"simple_query_info": "输入你想生成的音乐的自然语言描述",
|
| 89 |
"simple_vocal_language_label": "人声语言(可选)",
|
| 90 |
"simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
|
|
|
|
| 98 |
"lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
|
| 99 |
"lyrics_info": "带有结构的歌曲歌词",
|
| 100 |
"instrumental_label": "纯音乐",
|
| 101 |
+
"format_btn": "格式化",
|
| 102 |
"optional_params": "⚙️ 可选参数",
|
| 103 |
"vocal_language_label": "人声语言(可选)",
|
| 104 |
"vocal_language_info": "纯音乐使用 `unknown`",
|
|
|
|
| 228 |
"sample_created": "✅ 样本已创建!检查描述和歌词,然后点击生成音乐。",
|
| 229 |
"simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
|
| 230 |
"simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
|
| 231 |
+
"simple_example_loaded": "🎲 已从 {filename} 加载随机示例",
|
| 232 |
+
"format_success": "✅ 描述和歌词格式化成功",
|
| 233 |
+
"format_failed": "❌ 格式化失败: {error}",
|
| 234 |
+
"skipping_metas_cot": "⚡ 跳过 Phase 1 元数据 COT(样本已格式化)"
|
| 235 |
}
|
| 236 |
}
|
acestep/gradio_ui/interfaces/generation.py
CHANGED
|
@@ -314,15 +314,15 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 314 |
placeholder=t("generation.caption_placeholder"),
|
| 315 |
lines=3,
|
| 316 |
info=t("generation.caption_info"),
|
| 317 |
-
scale=
|
| 318 |
)
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
# Lyrics - wrapped in accordion that can be collapsed in Simple mode
|
| 327 |
with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
|
| 328 |
lyrics = gr.Textbox(
|
|
@@ -331,22 +331,40 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 331 |
lines=8,
|
| 332 |
info=t("generation.lyrics_info")
|
| 333 |
)
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
| 343 |
vocal_language = gr.Dropdown(
|
| 344 |
choices=VALID_LANGUAGES,
|
| 345 |
value="unknown",
|
| 346 |
label=t("generation.vocal_language_label"),
|
|
|
|
|
|
|
| 347 |
allow_custom_value=True,
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
bpm = gr.Number(
|
| 351 |
label=t("generation.bpm_label"),
|
| 352 |
value=None,
|
|
@@ -679,6 +697,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 679 |
"autogen_checkbox": autogen_checkbox,
|
| 680 |
"generate_btn": generate_btn,
|
| 681 |
"instrumental_checkbox": instrumental_checkbox,
|
|
|
|
| 682 |
"constrained_decoding_debug": constrained_decoding_debug,
|
| 683 |
"score_scale": score_scale,
|
| 684 |
"allow_lm_batch": allow_lm_batch,
|
|
|
|
| 314 |
placeholder=t("generation.caption_placeholder"),
|
| 315 |
lines=3,
|
| 316 |
info=t("generation.caption_info"),
|
| 317 |
+
scale=12,
|
| 318 |
)
|
| 319 |
+
with gr.Column(scale=1, min_width=100):
|
| 320 |
+
sample_btn = gr.Button(
|
| 321 |
+
"🎲",
|
| 322 |
+
variant="secondary",
|
| 323 |
+
size="sm",
|
| 324 |
+
scale=2,
|
| 325 |
+
)
|
| 326 |
# Lyrics - wrapped in accordion that can be collapsed in Simple mode
|
| 327 |
with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
|
| 328 |
lyrics = gr.Textbox(
|
|
|
|
| 331 |
lines=8,
|
| 332 |
info=t("generation.lyrics_info")
|
| 333 |
)
|
| 334 |
+
|
| 335 |
+
with gr.Row(variant="compact", equal_height=True):
|
| 336 |
+
instrumental_checkbox = gr.Checkbox(
|
| 337 |
+
label=t("generation.instrumental_label"),
|
| 338 |
+
value=False,
|
| 339 |
+
scale=1,
|
| 340 |
+
min_width=120,
|
| 341 |
+
container=True,
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
# 中间:语言选择 (Dropdown)
|
| 345 |
+
# 移除 gr.HTML hack,直接使用 label 参数,Gradio 会自动处理对齐
|
| 346 |
vocal_language = gr.Dropdown(
|
| 347 |
choices=VALID_LANGUAGES,
|
| 348 |
value="unknown",
|
| 349 |
label=t("generation.vocal_language_label"),
|
| 350 |
+
show_label=False,
|
| 351 |
+
container=True,
|
| 352 |
allow_custom_value=True,
|
| 353 |
+
scale=3,
|
| 354 |
+
)
|
| 355 |
+
|
| 356 |
+
# 右侧:格式化按钮 (Button)
|
| 357 |
+
# 放在同一行最右侧,操作更顺手
|
| 358 |
+
format_btn = gr.Button(
|
| 359 |
+
t("generation.format_btn"),
|
| 360 |
+
variant="secondary",
|
| 361 |
+
scale=1,
|
| 362 |
+
min_width=80,
|
| 363 |
)
|
| 364 |
+
|
| 365 |
+
# Optional Parameters
|
| 366 |
+
with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
|
| 367 |
+
with gr.Row():
|
| 368 |
bpm = gr.Number(
|
| 369 |
label=t("generation.bpm_label"),
|
| 370 |
value=None,
|
|
|
|
| 697 |
"autogen_checkbox": autogen_checkbox,
|
| 698 |
"generate_btn": generate_btn,
|
| 699 |
"instrumental_checkbox": instrumental_checkbox,
|
| 700 |
+
"format_btn": format_btn,
|
| 701 |
"constrained_decoding_debug": constrained_decoding_debug,
|
| 702 |
"score_scale": score_scale,
|
| 703 |
"allow_lm_batch": allow_lm_batch,
|
acestep/inference.py
CHANGED
|
@@ -671,8 +671,6 @@ def understand_music(
|
|
| 671 |
llm_handler,
|
| 672 |
audio_codes: str,
|
| 673 |
temperature: float = 0.85,
|
| 674 |
-
cfg_scale: float = 1.0,
|
| 675 |
-
negative_prompt: str = "NO USER INPUT",
|
| 676 |
top_k: Optional[int] = None,
|
| 677 |
top_p: Optional[float] = None,
|
| 678 |
repetition_penalty: float = 1.0,
|
|
@@ -687,13 +685,13 @@ def understand_music(
|
|
| 687 |
If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
|
| 688 |
instead of analyzing existing codes.
|
| 689 |
|
|
|
|
|
|
|
| 690 |
Args:
|
| 691 |
llm_handler: Initialized LLM handler (LLMHandler instance)
|
| 692 |
audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
|
| 693 |
Use empty string or "NO USER INPUT" to generate a sample example.
|
| 694 |
temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
|
| 695 |
-
cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
|
| 696 |
-
negative_prompt: Negative prompt for CFG guidance
|
| 697 |
top_k: Top-K sampling (None or 0 = disabled)
|
| 698 |
top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
|
| 699 |
repetition_penalty: Repetition penalty (1.0 = no penalty)
|
|
@@ -727,8 +725,6 @@ def understand_music(
|
|
| 727 |
metadata, status = llm_handler.understand_audio_from_codes(
|
| 728 |
audio_codes=audio_codes,
|
| 729 |
temperature=temperature,
|
| 730 |
-
cfg_scale=cfg_scale,
|
| 731 |
-
negative_prompt=negative_prompt,
|
| 732 |
top_k=top_k,
|
| 733 |
top_p=top_p,
|
| 734 |
repetition_penalty=repetition_penalty,
|
|
@@ -847,7 +843,7 @@ def create_sample(
|
|
| 847 |
llm_handler,
|
| 848 |
query: str,
|
| 849 |
instrumental: bool = False,
|
| 850 |
-
vocal_language: Optional[
|
| 851 |
temperature: float = 0.85,
|
| 852 |
top_k: Optional[int] = None,
|
| 853 |
top_p: Optional[float] = None,
|
|
@@ -869,9 +865,9 @@ def create_sample(
|
|
| 869 |
llm_handler: Initialized LLM handler (LLMHandler instance)
|
| 870 |
query: User's natural language music description (e.g., "a soft Bengali love song")
|
| 871 |
instrumental: Whether to generate instrumental music (no vocals)
|
| 872 |
-
vocal_language:
|
| 873 |
-
If provided, the model will be constrained to generate lyrics in
|
| 874 |
-
If None or
|
| 875 |
temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
|
| 876 |
top_k: Top-K sampling (None or 0 = disabled)
|
| 877 |
top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
|
|
@@ -883,7 +879,7 @@ def create_sample(
|
|
| 883 |
CreateSampleResult with generated sample fields and status
|
| 884 |
|
| 885 |
Example:
|
| 886 |
-
>>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language=
|
| 887 |
>>> if result.success:
|
| 888 |
... print(f"Caption: {result.caption}")
|
| 889 |
... print(f"Lyrics: {result.lyrics}")
|
|
@@ -897,14 +893,6 @@ def create_sample(
|
|
| 897 |
error="LLM not initialized",
|
| 898 |
)
|
| 899 |
|
| 900 |
-
# Validate query
|
| 901 |
-
if not query or not query.strip():
|
| 902 |
-
return CreateSampleResult(
|
| 903 |
-
status_message="No query provided. Please enter a music description.",
|
| 904 |
-
success=False,
|
| 905 |
-
error="Empty query",
|
| 906 |
-
)
|
| 907 |
-
|
| 908 |
try:
|
| 909 |
# Call LLM to create sample
|
| 910 |
metadata, status = llm_handler.create_sample_from_query(
|
|
@@ -982,3 +970,175 @@ def create_sample(
|
|
| 982 |
success=False,
|
| 983 |
error=str(e),
|
| 984 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
llm_handler,
|
| 672 |
audio_codes: str,
|
| 673 |
temperature: float = 0.85,
|
|
|
|
|
|
|
| 674 |
top_k: Optional[int] = None,
|
| 675 |
top_p: Optional[float] = None,
|
| 676 |
repetition_penalty: float = 1.0,
|
|
|
|
| 685 |
If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
|
| 686 |
instead of analyzing existing codes.
|
| 687 |
|
| 688 |
+
Note: cfg_scale and negative_prompt are not supported in understand mode.
|
| 689 |
+
|
| 690 |
Args:
|
| 691 |
llm_handler: Initialized LLM handler (LLMHandler instance)
|
| 692 |
audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
|
| 693 |
Use empty string or "NO USER INPUT" to generate a sample example.
|
| 694 |
temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
|
|
|
|
|
|
|
| 695 |
top_k: Top-K sampling (None or 0 = disabled)
|
| 696 |
top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
|
| 697 |
repetition_penalty: Repetition penalty (1.0 = no penalty)
|
|
|
|
| 725 |
metadata, status = llm_handler.understand_audio_from_codes(
|
| 726 |
audio_codes=audio_codes,
|
| 727 |
temperature=temperature,
|
|
|
|
|
|
|
| 728 |
top_k=top_k,
|
| 729 |
top_p=top_p,
|
| 730 |
repetition_penalty=repetition_penalty,
|
|
|
|
| 843 |
llm_handler,
|
| 844 |
query: str,
|
| 845 |
instrumental: bool = False,
|
| 846 |
+
vocal_language: Optional[str] = None,
|
| 847 |
temperature: float = 0.85,
|
| 848 |
top_k: Optional[int] = None,
|
| 849 |
top_p: Optional[float] = None,
|
|
|
|
| 865 |
llm_handler: Initialized LLM handler (LLMHandler instance)
|
| 866 |
query: User's natural language music description (e.g., "a soft Bengali love song")
|
| 867 |
instrumental: Whether to generate instrumental music (no vocals)
|
| 868 |
+
vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
|
| 869 |
+
If provided, the model will be constrained to generate lyrics in this language.
|
| 870 |
+
If None or "unknown", no language constraint is applied.
|
| 871 |
temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
|
| 872 |
top_k: Top-K sampling (None or 0 = disabled)
|
| 873 |
top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
|
|
|
|
| 879 |
CreateSampleResult with generated sample fields and status
|
| 880 |
|
| 881 |
Example:
|
| 882 |
+
>>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language="bn")
|
| 883 |
>>> if result.success:
|
| 884 |
... print(f"Caption: {result.caption}")
|
| 885 |
... print(f"Lyrics: {result.lyrics}")
|
|
|
|
| 893 |
error="LLM not initialized",
|
| 894 |
)
|
| 895 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 896 |
try:
|
| 897 |
# Call LLM to create sample
|
| 898 |
metadata, status = llm_handler.create_sample_from_query(
|
|
|
|
| 970 |
success=False,
|
| 971 |
error=str(e),
|
| 972 |
)
|
| 973 |
+
|
| 974 |
+
|
| 975 |
+
@dataclass
|
| 976 |
+
class FormatSampleResult:
|
| 977 |
+
"""Result of formatting user-provided caption and lyrics.
|
| 978 |
+
|
| 979 |
+
This is used by the "Format" feature where users provide caption and lyrics,
|
| 980 |
+
and the LLM formats them into structured music metadata and an enhanced description.
|
| 981 |
+
|
| 982 |
+
Attributes:
|
| 983 |
+
# Metadata Fields
|
| 984 |
+
caption: Enhanced/formatted music description/caption
|
| 985 |
+
lyrics: Formatted lyrics (may be same as input or reformatted)
|
| 986 |
+
bpm: Beats per minute (None if not detected)
|
| 987 |
+
duration: Duration in seconds (None if not detected)
|
| 988 |
+
keyscale: Musical key (e.g., "C Major")
|
| 989 |
+
language: Vocal language code (e.g., "en", "zh")
|
| 990 |
+
timesignature: Time signature (e.g., "4")
|
| 991 |
+
|
| 992 |
+
# Status
|
| 993 |
+
status_message: Status message from formatting
|
| 994 |
+
success: Whether formatting completed successfully
|
| 995 |
+
error: Error message if formatting failed
|
| 996 |
+
"""
|
| 997 |
+
# Metadata Fields
|
| 998 |
+
caption: str = ""
|
| 999 |
+
lyrics: str = ""
|
| 1000 |
+
bpm: Optional[int] = None
|
| 1001 |
+
duration: Optional[float] = None
|
| 1002 |
+
keyscale: str = ""
|
| 1003 |
+
language: str = ""
|
| 1004 |
+
timesignature: str = ""
|
| 1005 |
+
|
| 1006 |
+
# Status
|
| 1007 |
+
status_message: str = ""
|
| 1008 |
+
success: bool = True
|
| 1009 |
+
error: Optional[str] = None
|
| 1010 |
+
|
| 1011 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 1012 |
+
"""Convert result to dictionary for JSON serialization."""
|
| 1013 |
+
return asdict(self)
|
| 1014 |
+
|
| 1015 |
+
|
| 1016 |
+
def format_sample(
|
| 1017 |
+
llm_handler,
|
| 1018 |
+
caption: str,
|
| 1019 |
+
lyrics: str,
|
| 1020 |
+
user_metadata: Optional[Dict[str, Any]] = None,
|
| 1021 |
+
temperature: float = 0.85,
|
| 1022 |
+
top_k: Optional[int] = None,
|
| 1023 |
+
top_p: Optional[float] = None,
|
| 1024 |
+
repetition_penalty: float = 1.0,
|
| 1025 |
+
use_constrained_decoding: bool = True,
|
| 1026 |
+
constrained_decoding_debug: bool = False,
|
| 1027 |
+
) -> FormatSampleResult:
|
| 1028 |
+
"""Format user-provided caption and lyrics using the 5Hz Language Model.
|
| 1029 |
+
|
| 1030 |
+
This function takes user input (caption and lyrics) and generates structured
|
| 1031 |
+
music metadata including an enhanced caption, BPM, duration, key, language,
|
| 1032 |
+
and time signature.
|
| 1033 |
+
|
| 1034 |
+
If user_metadata is provided, those values will be used to constrain the
|
| 1035 |
+
decoding, ensuring the output matches user-specified values.
|
| 1036 |
+
|
| 1037 |
+
Note: cfg_scale and negative_prompt are not supported in format mode.
|
| 1038 |
+
|
| 1039 |
+
Args:
|
| 1040 |
+
llm_handler: Initialized LLM handler (LLMHandler instance)
|
| 1041 |
+
caption: User's caption/description (e.g., "Latin pop, reggaeton")
|
| 1042 |
+
lyrics: User's lyrics with structure tags
|
| 1043 |
+
user_metadata: Optional dict with user-provided metadata to constrain decoding.
|
| 1044 |
+
Supported keys: bpm, duration, keyscale, timesignature, language
|
| 1045 |
+
temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
|
| 1046 |
+
top_k: Top-K sampling (None or 0 = disabled)
|
| 1047 |
+
top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
|
| 1048 |
+
repetition_penalty: Repetition penalty (1.0 = no penalty)
|
| 1049 |
+
use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
|
| 1050 |
+
constrained_decoding_debug: Whether to enable debug logging for constrained decoding
|
| 1051 |
+
|
| 1052 |
+
Returns:
|
| 1053 |
+
FormatSampleResult with formatted metadata fields and status
|
| 1054 |
+
|
| 1055 |
+
Example:
|
| 1056 |
+
>>> result = format_sample(llm_handler, "Latin pop, reggaeton", "[Verse 1]\\nHola mundo...")
|
| 1057 |
+
>>> if result.success:
|
| 1058 |
+
... print(f"Caption: {result.caption}")
|
| 1059 |
+
... print(f"BPM: {result.bpm}")
|
| 1060 |
+
... print(f"Lyrics: {result.lyrics}")
|
| 1061 |
+
"""
|
| 1062 |
+
# Check if LLM is initialized
|
| 1063 |
+
if not llm_handler.llm_initialized:
|
| 1064 |
+
return FormatSampleResult(
|
| 1065 |
+
status_message="5Hz LM not initialized. Please initialize it first.",
|
| 1066 |
+
success=False,
|
| 1067 |
+
error="LLM not initialized",
|
| 1068 |
+
)
|
| 1069 |
+
|
| 1070 |
+
try:
|
| 1071 |
+
# Call LLM formatting
|
| 1072 |
+
metadata, status = llm_handler.format_sample_from_input(
|
| 1073 |
+
caption=caption,
|
| 1074 |
+
lyrics=lyrics,
|
| 1075 |
+
user_metadata=user_metadata,
|
| 1076 |
+
temperature=temperature,
|
| 1077 |
+
top_k=top_k,
|
| 1078 |
+
top_p=top_p,
|
| 1079 |
+
repetition_penalty=repetition_penalty,
|
| 1080 |
+
use_constrained_decoding=use_constrained_decoding,
|
| 1081 |
+
constrained_decoding_debug=constrained_decoding_debug,
|
| 1082 |
+
)
|
| 1083 |
+
|
| 1084 |
+
# Check if LLM returned empty metadata (error case)
|
| 1085 |
+
if not metadata:
|
| 1086 |
+
return FormatSampleResult(
|
| 1087 |
+
status_message=status or "Failed to format input",
|
| 1088 |
+
success=False,
|
| 1089 |
+
error=status or "Empty metadata returned",
|
| 1090 |
+
)
|
| 1091 |
+
|
| 1092 |
+
# Extract and convert fields
|
| 1093 |
+
result_caption = metadata.get('caption', '')
|
| 1094 |
+
result_lyrics = metadata.get('lyrics', lyrics) # Fall back to input lyrics
|
| 1095 |
+
keyscale = metadata.get('keyscale', '')
|
| 1096 |
+
language = metadata.get('language', metadata.get('vocal_language', ''))
|
| 1097 |
+
timesignature = metadata.get('timesignature', '')
|
| 1098 |
+
|
| 1099 |
+
# Convert BPM to int
|
| 1100 |
+
bpm = None
|
| 1101 |
+
bpm_value = metadata.get('bpm')
|
| 1102 |
+
if bpm_value is not None and bpm_value != 'N/A' and bpm_value != '':
|
| 1103 |
+
try:
|
| 1104 |
+
bpm = int(bpm_value)
|
| 1105 |
+
except (ValueError, TypeError):
|
| 1106 |
+
pass
|
| 1107 |
+
|
| 1108 |
+
# Convert duration to float
|
| 1109 |
+
duration = None
|
| 1110 |
+
duration_value = metadata.get('duration')
|
| 1111 |
+
if duration_value is not None and duration_value != 'N/A' and duration_value != '':
|
| 1112 |
+
try:
|
| 1113 |
+
duration = float(duration_value)
|
| 1114 |
+
except (ValueError, TypeError):
|
| 1115 |
+
pass
|
| 1116 |
+
|
| 1117 |
+
# Clean up N/A values
|
| 1118 |
+
if keyscale == 'N/A':
|
| 1119 |
+
keyscale = ''
|
| 1120 |
+
if language == 'N/A':
|
| 1121 |
+
language = ''
|
| 1122 |
+
if timesignature == 'N/A':
|
| 1123 |
+
timesignature = ''
|
| 1124 |
+
|
| 1125 |
+
return FormatSampleResult(
|
| 1126 |
+
caption=result_caption,
|
| 1127 |
+
lyrics=result_lyrics,
|
| 1128 |
+
bpm=bpm,
|
| 1129 |
+
duration=duration,
|
| 1130 |
+
keyscale=keyscale,
|
| 1131 |
+
language=language,
|
| 1132 |
+
timesignature=timesignature,
|
| 1133 |
+
status_message=status,
|
| 1134 |
+
success=True,
|
| 1135 |
+
error=None,
|
| 1136 |
+
)
|
| 1137 |
+
|
| 1138 |
+
except Exception as e:
|
| 1139 |
+
logger.exception("Format sample failed")
|
| 1140 |
+
return FormatSampleResult(
|
| 1141 |
+
status_message=f"Error: {str(e)}",
|
| 1142 |
+
success=False,
|
| 1143 |
+
error=str(e),
|
| 1144 |
+
)
|
acestep/llm_inference.py
CHANGED
|
@@ -19,7 +19,7 @@ from transformers.generation.logits_process import (
|
|
| 19 |
RepetitionPenaltyLogitsProcessor,
|
| 20 |
)
|
| 21 |
from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
|
| 22 |
-
from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION
|
| 23 |
|
| 24 |
|
| 25 |
class LLMHandler:
|
|
@@ -1296,8 +1296,6 @@ class LLMHandler:
|
|
| 1296 |
self,
|
| 1297 |
audio_codes: str,
|
| 1298 |
temperature: float = 0.3,
|
| 1299 |
-
cfg_scale: float = 1.0,
|
| 1300 |
-
negative_prompt: str = "NO USER INPUT",
|
| 1301 |
top_k: Optional[int] = None,
|
| 1302 |
top_p: Optional[float] = None,
|
| 1303 |
repetition_penalty: float = 1.0,
|
|
@@ -1306,16 +1304,16 @@ class LLMHandler:
|
|
| 1306 |
) -> Tuple[Dict[str, Any], str]:
|
| 1307 |
"""
|
| 1308 |
Understand audio codes and generate metadata + lyrics.
|
| 1309 |
-
|
| 1310 |
This is the reverse of the normal generation flow:
|
| 1311 |
- Input: Audio codes
|
| 1312 |
- Output: Metadata (bpm, caption, duration, etc.) + Lyrics
|
| 1313 |
-
|
|
|
|
|
|
|
| 1314 |
Args:
|
| 1315 |
audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
|
| 1316 |
temperature: Sampling temperature for generation
|
| 1317 |
-
cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
|
| 1318 |
-
negative_prompt: Negative prompt for CFG
|
| 1319 |
top_k: Top-K sampling (None = disabled)
|
| 1320 |
top_p: Top-P (nucleus) sampling (None = disabled)
|
| 1321 |
repetition_penalty: Repetition penalty (1.0 = no penalty)
|
|
@@ -1352,12 +1350,11 @@ class LLMHandler:
|
|
| 1352 |
print(f"formatted_prompt: {formatted_prompt}")
|
| 1353 |
# Generate using constrained decoding (understand phase)
|
| 1354 |
# We want to generate metadata first (CoT), then lyrics (natural text)
|
|
|
|
| 1355 |
output_text, status = self.generate_from_formatted_prompt(
|
| 1356 |
formatted_prompt=formatted_prompt,
|
| 1357 |
cfg={
|
| 1358 |
"temperature": temperature,
|
| 1359 |
-
"cfg_scale": cfg_scale,
|
| 1360 |
-
"negative_prompt": negative_prompt,
|
| 1361 |
"top_k": top_k,
|
| 1362 |
"top_p": top_p,
|
| 1363 |
"repetition_penalty": repetition_penalty,
|
|
@@ -1491,7 +1488,7 @@ class LLMHandler:
|
|
| 1491 |
self,
|
| 1492 |
query: str,
|
| 1493 |
instrumental: bool = False,
|
| 1494 |
-
vocal_language: Optional[
|
| 1495 |
temperature: float = 0.85,
|
| 1496 |
top_k: Optional[int] = None,
|
| 1497 |
top_p: Optional[float] = None,
|
|
@@ -1509,8 +1506,8 @@ class LLMHandler:
|
|
| 1509 |
Args:
|
| 1510 |
query: User's natural language music description
|
| 1511 |
instrumental: Whether to generate instrumental music (no vocals)
|
| 1512 |
-
vocal_language:
|
| 1513 |
-
If provided and not
|
| 1514 |
temperature: Sampling temperature for generation (0.0-2.0)
|
| 1515 |
top_k: Top-K sampling (None = disabled)
|
| 1516 |
top_p: Top-P (nucleus) sampling (None = disabled)
|
|
@@ -1532,7 +1529,7 @@ class LLMHandler:
|
|
| 1532 |
|
| 1533 |
Example:
|
| 1534 |
query = "a soft Bengali love song for a quiet evening"
|
| 1535 |
-
metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language=
|
| 1536 |
print(metadata['caption']) # "A gentle romantic acoustic pop ballad..."
|
| 1537 |
print(metadata['lyrics']) # "[Intro: ...]\\n..."
|
| 1538 |
"""
|
|
@@ -1540,7 +1537,7 @@ class LLMHandler:
|
|
| 1540 |
return {}, "❌ 5Hz LM not initialized. Please initialize it first."
|
| 1541 |
|
| 1542 |
if not query or not query.strip():
|
| 1543 |
-
|
| 1544 |
|
| 1545 |
logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
|
| 1546 |
|
|
@@ -1554,14 +1551,11 @@ class LLMHandler:
|
|
| 1554 |
# Build user_metadata if vocal_language is specified and is not "unknown"
|
| 1555 |
user_metadata = None
|
| 1556 |
skip_language = False
|
| 1557 |
-
if vocal_language and
|
| 1558 |
-
#
|
| 1559 |
-
|
| 1560 |
-
|
| 1561 |
-
|
| 1562 |
-
user_metadata = {"language": valid_languages[0]}
|
| 1563 |
-
skip_language = True # Skip language generation since we're injecting it
|
| 1564 |
-
logger.info(f"Using user-specified language: {valid_languages[0]}")
|
| 1565 |
|
| 1566 |
# Generate using constrained decoding (inspiration phase)
|
| 1567 |
# Similar to understand mode - generate metadata first (CoT), then lyrics
|
|
@@ -1612,6 +1606,204 @@ class LLMHandler:
|
|
| 1612 |
status_msg = f"✅ Sample created successfully\nGenerated fields: {', '.join(metadata.keys())}"
|
| 1613 |
return metadata, status_msg
|
| 1614 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1615 |
def generate_from_formatted_prompt(
|
| 1616 |
self,
|
| 1617 |
formatted_prompt: str,
|
|
|
|
| 19 |
RepetitionPenaltyLogitsProcessor,
|
| 20 |
)
|
| 21 |
from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
|
| 22 |
+
from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION, DEFAULT_LM_REWRITE_INSTRUCTION
|
| 23 |
|
| 24 |
|
| 25 |
class LLMHandler:
|
|
|
|
| 1296 |
self,
|
| 1297 |
audio_codes: str,
|
| 1298 |
temperature: float = 0.3,
|
|
|
|
|
|
|
| 1299 |
top_k: Optional[int] = None,
|
| 1300 |
top_p: Optional[float] = None,
|
| 1301 |
repetition_penalty: float = 1.0,
|
|
|
|
| 1304 |
) -> Tuple[Dict[str, Any], str]:
|
| 1305 |
"""
|
| 1306 |
Understand audio codes and generate metadata + lyrics.
|
| 1307 |
+
|
| 1308 |
This is the reverse of the normal generation flow:
|
| 1309 |
- Input: Audio codes
|
| 1310 |
- Output: Metadata (bpm, caption, duration, etc.) + Lyrics
|
| 1311 |
+
|
| 1312 |
+
Note: cfg_scale and negative_prompt are not supported in understand mode.
|
| 1313 |
+
|
| 1314 |
Args:
|
| 1315 |
audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
|
| 1316 |
temperature: Sampling temperature for generation
|
|
|
|
|
|
|
| 1317 |
top_k: Top-K sampling (None = disabled)
|
| 1318 |
top_p: Top-P (nucleus) sampling (None = disabled)
|
| 1319 |
repetition_penalty: Repetition penalty (1.0 = no penalty)
|
|
|
|
| 1350 |
print(f"formatted_prompt: {formatted_prompt}")
|
| 1351 |
# Generate using constrained decoding (understand phase)
|
| 1352 |
# We want to generate metadata first (CoT), then lyrics (natural text)
|
| 1353 |
+
# Note: cfg_scale and negative_prompt are not used in understand mode
|
| 1354 |
output_text, status = self.generate_from_formatted_prompt(
|
| 1355 |
formatted_prompt=formatted_prompt,
|
| 1356 |
cfg={
|
| 1357 |
"temperature": temperature,
|
|
|
|
|
|
|
| 1358 |
"top_k": top_k,
|
| 1359 |
"top_p": top_p,
|
| 1360 |
"repetition_penalty": repetition_penalty,
|
|
|
|
| 1488 |
self,
|
| 1489 |
query: str,
|
| 1490 |
instrumental: bool = False,
|
| 1491 |
+
vocal_language: Optional[str] = None,
|
| 1492 |
temperature: float = 0.85,
|
| 1493 |
top_k: Optional[int] = None,
|
| 1494 |
top_p: Optional[float] = None,
|
|
|
|
| 1506 |
Args:
|
| 1507 |
query: User's natural language music description
|
| 1508 |
instrumental: Whether to generate instrumental music (no vocals)
|
| 1509 |
+
vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
|
| 1510 |
+
If provided and not "unknown", it will be used.
|
| 1511 |
temperature: Sampling temperature for generation (0.0-2.0)
|
| 1512 |
top_k: Top-K sampling (None = disabled)
|
| 1513 |
top_p: Top-P (nucleus) sampling (None = disabled)
|
|
|
|
| 1529 |
|
| 1530 |
Example:
|
| 1531 |
query = "a soft Bengali love song for a quiet evening"
|
| 1532 |
+
metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language="bn")
|
| 1533 |
print(metadata['caption']) # "A gentle romantic acoustic pop ballad..."
|
| 1534 |
print(metadata['lyrics']) # "[Intro: ...]\\n..."
|
| 1535 |
"""
|
|
|
|
| 1537 |
return {}, "❌ 5Hz LM not initialized. Please initialize it first."
|
| 1538 |
|
| 1539 |
if not query or not query.strip():
|
| 1540 |
+
query = "NO USER INPUT"
|
| 1541 |
|
| 1542 |
logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
|
| 1543 |
|
|
|
|
| 1551 |
# Build user_metadata if vocal_language is specified and is not "unknown"
|
| 1552 |
user_metadata = None
|
| 1553 |
skip_language = False
|
| 1554 |
+
if vocal_language and vocal_language.strip() and vocal_language.strip().lower() != "unknown":
|
| 1555 |
+
# Use the specified language for constrained decoding
|
| 1556 |
+
user_metadata = {"language": vocal_language.strip()}
|
| 1557 |
+
skip_language = True # Skip language generation since we're injecting it
|
| 1558 |
+
logger.info(f"Using user-specified language: {vocal_language.strip()}")
|
|
|
|
|
|
|
|
|
|
| 1559 |
|
| 1560 |
# Generate using constrained decoding (inspiration phase)
|
| 1561 |
# Similar to understand mode - generate metadata first (CoT), then lyrics
|
|
|
|
| 1606 |
status_msg = f"✅ Sample created successfully\nGenerated fields: {', '.join(metadata.keys())}"
|
| 1607 |
return metadata, status_msg
|
| 1608 |
|
| 1609 |
+
def build_formatted_prompt_for_format(
|
| 1610 |
+
self,
|
| 1611 |
+
caption: str,
|
| 1612 |
+
lyrics: str,
|
| 1613 |
+
is_negative_prompt: bool = False,
|
| 1614 |
+
negative_prompt: str = "NO USER INPUT"
|
| 1615 |
+
) -> str:
|
| 1616 |
+
"""
|
| 1617 |
+
Build the chat-formatted prompt for format/rewrite mode.
|
| 1618 |
+
|
| 1619 |
+
This formats user-provided caption and lyrics into a more detailed and specific
|
| 1620 |
+
musical description with metadata.
|
| 1621 |
+
|
| 1622 |
+
Args:
|
| 1623 |
+
caption: User's caption/description of the music
|
| 1624 |
+
lyrics: User's lyrics
|
| 1625 |
+
is_negative_prompt: If True, builds unconditional prompt for CFG
|
| 1626 |
+
negative_prompt: Negative prompt for CFG (used when is_negative_prompt=True)
|
| 1627 |
+
|
| 1628 |
+
Returns:
|
| 1629 |
+
Formatted prompt string
|
| 1630 |
+
|
| 1631 |
+
Example:
|
| 1632 |
+
caption = "Latin pop, reggaeton, flamenco-pop"
|
| 1633 |
+
lyrics = "[Verse 1]\\nTengo un nudo..."
|
| 1634 |
+
prompt = handler.build_formatted_prompt_for_format(caption, lyrics)
|
| 1635 |
+
"""
|
| 1636 |
+
if self.llm_tokenizer is None:
|
| 1637 |
+
raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")
|
| 1638 |
+
|
| 1639 |
+
if is_negative_prompt:
|
| 1640 |
+
# For CFG unconditional prompt
|
| 1641 |
+
user_content = negative_prompt if negative_prompt and negative_prompt.strip() else ""
|
| 1642 |
+
else:
|
| 1643 |
+
# Normal prompt: caption + lyrics
|
| 1644 |
+
user_content = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}"
|
| 1645 |
+
|
| 1646 |
+
return self.llm_tokenizer.apply_chat_template(
|
| 1647 |
+
[
|
| 1648 |
+
{
|
| 1649 |
+
"role": "system",
|
| 1650 |
+
"content": f"# Instruction\n{DEFAULT_LM_REWRITE_INSTRUCTION}\n\n"
|
| 1651 |
+
},
|
| 1652 |
+
{
|
| 1653 |
+
"role": "user",
|
| 1654 |
+
"content": user_content
|
| 1655 |
+
},
|
| 1656 |
+
],
|
| 1657 |
+
tokenize=False,
|
| 1658 |
+
add_generation_prompt=True,
|
| 1659 |
+
)
|
| 1660 |
+
|
| 1661 |
+
def format_sample_from_input(
    self,
    caption: str,
    lyrics: str,
    user_metadata: Optional[Dict[str, Any]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> Tuple[Dict[str, Any], str]:
    """
    Format user-provided caption and lyrics into structured music metadata.

    This is the "Format" feature that takes user input and generates:
    - Enhanced caption with detailed music description
    - Metadata (bpm, duration, keyscale, language, timesignature)
    - Formatted lyrics (falls back to the input lyrics when the model
      output contains no lyrics section)

    Note: cfg_scale and negative_prompt are not supported in format mode.

    Args:
        caption: User's caption/description (e.g., "Latin pop, reggaeton").
            Empty or whitespace-only input is replaced by "NO USER INPUT".
        lyrics: User's lyrics with structure tags. Empty or whitespace-only
            input is replaced by "[Instrumental]".
        user_metadata: Optional dict with user-provided metadata to constrain decoding.
            Supported keys: bpm, duration, keyscale, timesignature, language.
            bpm and duration must convert to a positive int; values that
            cannot be converted are logged and ignored, non-positive values
            are ignored.
        temperature: Sampling temperature for generation (0.0-2.0)
        top_k: Top-K sampling (None = disabled)
        top_p: Top-P (nucleus) sampling (None = disabled)
        repetition_penalty: Repetition penalty (1.0 = no penalty)
        use_constrained_decoding: Whether to use FSM-based constrained decoding
        constrained_decoding_debug: Whether to enable debug logging

    Returns:
        Tuple of (metadata_dict, status_message).
        On failure (LM not initialized, or generation produced no text)
        metadata_dict is empty and status_message describes the problem.
        On success metadata_dict holds whatever parse_lm_output extracted
        (expected: bpm, caption, duration, keyscale, language,
        timesignature) plus:
            - lyrics: str (generated, or the input lyrics as fallback)

    Example:
        caption = "Latin pop, reggaeton, flamenco-pop"
        lyrics = "[Verse 1]\\nTengo un nudo en la garganta..."
        metadata, status = handler.format_sample_from_input(caption, lyrics)
        print(metadata['caption'])  # "A dramatic and powerful Latin pop track..."
        print(metadata['bpm'])  # 100
    """
    if not getattr(self, "llm_initialized", False):
        return {}, "❌ 5Hz LM not initialized. Please initialize it first."

    # Substitute placeholders so the prompt never contains an empty section.
    if not caption or not caption.strip():
        caption = "NO USER INPUT"
    if not lyrics or not lyrics.strip():
        lyrics = "[Instrumental]"

    logger.info(f"Formatting sample from input: caption={caption[:50]}..., lyrics length={len(lyrics)}")

    # Build formatted prompt for format task
    formatted_prompt = self.build_formatted_prompt_for_format(
        caption=caption,
        lyrics=lyrics,
    )
    logger.debug(f"Formatted prompt for format: {formatted_prompt}")

    # Build constrained decoding metadata from user_metadata
    constrained_metadata = None
    if user_metadata:
        constrained_metadata = {}

        # Numeric fields: keep only values that coerce to a positive int.
        for field in ("bpm", "duration"):
            raw_value = user_metadata.get(field)
            if raw_value is None:
                continue
            try:
                numeric_value = int(raw_value)
            except (ValueError, TypeError, OverflowError):
                # OverflowError covers infinite floats, which int() rejects
                # with a different exception type than malformed strings.
                logger.warning(f"Ignoring invalid user-provided {field}: {raw_value!r}")
                continue
            if numeric_value > 0:
                constrained_metadata[field] = numeric_value

        # Free-form fields: pass through any truthy value unchanged.
        for field in ("keyscale", "timesignature", "language"):
            if user_metadata.get(field):
                constrained_metadata[field] = user_metadata[field]

        # Only use if we have at least one field
        if not constrained_metadata:
            constrained_metadata = None
        else:
            logger.info(f"Using user-provided metadata constraints: {constrained_metadata}")

    # Language is only stored when truthy, so key presence means it was supplied.
    skip_language = constrained_metadata is not None and "language" in constrained_metadata

    # Generate using constrained decoding (format phase)
    # Similar to understand/inspiration mode - generate metadata first (CoT), then formatted lyrics
    # Note: cfg_scale and negative_prompt are not used in format mode
    output_text, status = self.generate_from_formatted_prompt(
        formatted_prompt=formatted_prompt,
        cfg={
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "target_duration": None,  # No duration constraint for generation length
            "user_metadata": constrained_metadata,  # Inject user-provided metadata
            "skip_caption": False,  # Generate caption
            "skip_language": skip_language,
            "skip_genres": False,  # Generate genres
            "generation_phase": "understand",  # Use understand phase for metadata + free-form lyrics
            "caption": "",
            "lyrics": "",
        },
        use_constrained_decoding=use_constrained_decoding,
        constrained_decoding_debug=constrained_decoding_debug,
        stop_at_reasoning=False,  # Continue after </think> to get formatted lyrics
    )

    if not output_text:
        return {}, status

    # Parse metadata and extract lyrics
    metadata, _ = self.parse_lm_output(output_text)

    # Extract formatted lyrics section (everything after </think>);
    # if no lyrics were generated, keep the original input.
    formatted_lyrics = self._extract_lyrics_from_output(output_text)
    metadata['lyrics'] = formatted_lyrics if formatted_lyrics else lyrics

    logger.info(f"Format completed successfully. Generated {len(metadata)} fields")
    if constrained_decoding_debug:
        logger.debug(f"Generated metadata: {list(metadata.keys())}")
        logger.debug(f"Output text preview: {output_text[:300]}...")

    status_msg = f"✅ Format completed successfully\nGenerated fields: {', '.join(metadata.keys())}"
    return metadata, status_msg
|
| 1806 |
+
|
| 1807 |
def generate_from_formatted_prompt(
|
| 1808 |
self,
|
| 1809 |
formatted_prompt: str,
|
examples/simple_mode/example_01.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "a soft Bengali love song for a quiet evening",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "a soft Bengali love song for a quiet evening",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "bn"
|
| 5 |
}
|
examples/simple_mode/example_02.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "an upbeat summer pop song with catchy hooks",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "an upbeat summer pop song with catchy hooks",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "en"
|
| 5 |
}
|
examples/simple_mode/example_03.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "epic orchestral cinematic music for a movie trailer",
|
| 3 |
"instrumental": true,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "epic orchestral cinematic music for a movie trailer",
|
| 3 |
"instrumental": true,
|
| 4 |
+
"vocal_language": "unknown"
|
| 5 |
}
|
examples/simple_mode/example_04.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "一首深情的中文抒情歌曲,适合夜晚独自聆听",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "一首深情的中文抒情歌曲,适合夜晚独自聆听",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "zh"
|
| 5 |
}
|
examples/simple_mode/example_05.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "Japanese city pop with nostalgic 80s vibes",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "Japanese city pop with nostalgic 80s vibes",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "ja"
|
| 5 |
}
|
examples/simple_mode/example_06.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "lo-fi hip hop beats for studying and relaxing",
|
| 3 |
"instrumental": true,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "lo-fi hip hop beats for studying and relaxing",
|
| 3 |
"instrumental": true,
|
| 4 |
+
"vocal_language": "unknown"
|
| 5 |
}
|
examples/simple_mode/example_07.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "energetic K-pop dance track with powerful vocals",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "energetic K-pop dance track with powerful vocals",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "ko"
|
| 5 |
}
|
examples/simple_mode/example_08.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "romantic Spanish guitar ballad with heartfelt lyrics",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "romantic Spanish guitar ballad with heartfelt lyrics",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "es"
|
| 5 |
}
|
examples/simple_mode/example_09.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "中国风电子舞曲,融合古典乐器与现代节拍",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "中国风电子舞曲,融合古典乐器与现代节拍",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "zh"
|
| 5 |
}
|
examples/simple_mode/example_10.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "peaceful piano melody for meditation and relaxation",
|
| 3 |
"instrumental": true,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "peaceful piano melody for meditation and relaxation",
|
| 3 |
"instrumental": true,
|
| 4 |
+
"vocal_language": "unknown"
|
| 5 |
}
|