Spaces:
Running
on
A100
Running
on
A100
Merge pull request #6 from ace-step/add_simple_mode
Browse files
- acestep/gradio_ui/events/__init__.py +69 -2
- acestep/gradio_ui/events/generation_handlers.py +267 -1
- acestep/gradio_ui/i18n/en.json +17 -2
- acestep/gradio_ui/i18n/ja.json +17 -2
- acestep/gradio_ui/i18n/zh.json +17 -2
- acestep/gradio_ui/interfaces/generation.py +74 -6
- acestep/inference.py +185 -0
- acestep/llm_inference.py +181 -2
- examples/simple_mode/example_01.json +5 -0
- examples/simple_mode/example_02.json +5 -0
- examples/simple_mode/example_03.json +5 -0
- examples/simple_mode/example_04.json +5 -0
- examples/simple_mode/example_05.json +5 -0
- examples/simple_mode/example_06.json +5 -0
- examples/simple_mode/example_07.json +5 -0
- examples/simple_mode/example_08.json +5 -0
- examples/simple_mode/example_09.json +5 -0
- examples/simple_mode/example_10.json +5 -0
acestep/gradio_ui/events/__init__.py
CHANGED
|
@@ -121,11 +121,11 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 121 |
)
|
| 122 |
|
| 123 |
# ========== Sample/Transcribe Handlers ==========
|
|
|
|
| 124 |
generation_section["sample_btn"].click(
|
| 125 |
-
fn=lambda task
|
| 126 |
inputs=[
|
| 127 |
generation_section["task_type"],
|
| 128 |
-
generation_section["constrained_decoding_debug"]
|
| 129 |
],
|
| 130 |
outputs=[
|
| 131 |
generation_section["captions"],
|
|
@@ -190,6 +190,73 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 190 |
outputs=[generation_section["lyrics"]]
|
| 191 |
)
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
# ========== Load/Save Metadata ==========
|
| 194 |
generation_section["load_file"].upload(
|
| 195 |
fn=gen_h.load_metadata,
|
|
|
|
| 121 |
)
|
| 122 |
|
| 123 |
# ========== Sample/Transcribe Handlers ==========
|
| 124 |
+
# Load random example from ./examples/text2music directory
|
| 125 |
generation_section["sample_btn"].click(
|
| 126 |
+
fn=lambda task: gen_h.load_random_example(task) + (True,),
|
| 127 |
inputs=[
|
| 128 |
generation_section["task_type"],
|
|
|
|
| 129 |
],
|
| 130 |
outputs=[
|
| 131 |
generation_section["captions"],
|
|
|
|
| 190 |
outputs=[generation_section["lyrics"]]
|
| 191 |
)
|
| 192 |
|
| 193 |
+
# ========== Simple/Custom Mode Toggle ==========
|
| 194 |
+
generation_section["generation_mode"].change(
|
| 195 |
+
fn=gen_h.handle_generation_mode_change,
|
| 196 |
+
inputs=[generation_section["generation_mode"]],
|
| 197 |
+
outputs=[
|
| 198 |
+
generation_section["simple_mode_group"],
|
| 199 |
+
generation_section["caption_accordion"],
|
| 200 |
+
generation_section["lyrics_accordion"],
|
| 201 |
+
generation_section["generate_btn"],
|
| 202 |
+
generation_section["simple_sample_created"],
|
| 203 |
+
generation_section["optional_params_accordion"],
|
| 204 |
+
]
|
| 205 |
+
)
|
| 206 |
+
|
| 207 |
+
# ========== Simple Mode Instrumental Checkbox ==========
|
| 208 |
+
# When instrumental is checked, disable vocal language and set to ["unknown"]
|
| 209 |
+
generation_section["simple_instrumental_checkbox"].change(
|
| 210 |
+
fn=gen_h.handle_simple_instrumental_change,
|
| 211 |
+
inputs=[generation_section["simple_instrumental_checkbox"]],
|
| 212 |
+
outputs=[generation_section["simple_vocal_language"]]
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
# ========== Random Description Button ==========
|
| 216 |
+
generation_section["random_desc_btn"].click(
|
| 217 |
+
fn=gen_h.load_random_simple_description,
|
| 218 |
+
inputs=[],
|
| 219 |
+
outputs=[
|
| 220 |
+
generation_section["simple_query_input"],
|
| 221 |
+
generation_section["simple_instrumental_checkbox"],
|
| 222 |
+
generation_section["simple_vocal_language"],
|
| 223 |
+
]
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
# ========== Create Sample Button (Simple Mode) ==========
|
| 227 |
+
# Note: cfg_scale and negative_prompt are not supported in create_sample mode
|
| 228 |
+
generation_section["create_sample_btn"].click(
|
| 229 |
+
fn=lambda query, instrumental, vocal_lang, temp, top_k, top_p, debug: gen_h.handle_create_sample(
|
| 230 |
+
llm_handler, query, instrumental, vocal_lang, temp, top_k, top_p, debug
|
| 231 |
+
),
|
| 232 |
+
inputs=[
|
| 233 |
+
generation_section["simple_query_input"],
|
| 234 |
+
generation_section["simple_instrumental_checkbox"],
|
| 235 |
+
generation_section["simple_vocal_language"],
|
| 236 |
+
generation_section["lm_temperature"],
|
| 237 |
+
generation_section["lm_top_k"],
|
| 238 |
+
generation_section["lm_top_p"],
|
| 239 |
+
generation_section["constrained_decoding_debug"],
|
| 240 |
+
],
|
| 241 |
+
outputs=[
|
| 242 |
+
generation_section["captions"],
|
| 243 |
+
generation_section["lyrics"],
|
| 244 |
+
generation_section["bpm"],
|
| 245 |
+
generation_section["audio_duration"],
|
| 246 |
+
generation_section["key_scale"],
|
| 247 |
+
generation_section["vocal_language"],
|
| 248 |
+
generation_section["time_signature"],
|
| 249 |
+
generation_section["instrumental_checkbox"],
|
| 250 |
+
generation_section["caption_accordion"],
|
| 251 |
+
generation_section["lyrics_accordion"],
|
| 252 |
+
generation_section["generate_btn"],
|
| 253 |
+
generation_section["simple_sample_created"],
|
| 254 |
+
generation_section["think_checkbox"],
|
| 255 |
+
results_section["is_format_caption_state"],
|
| 256 |
+
results_section["status_output"],
|
| 257 |
+
]
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
# ========== Load/Save Metadata ==========
|
| 261 |
generation_section["load_file"].upload(
|
| 262 |
fn=gen_h.load_metadata,
|
acestep/gradio_ui/events/generation_handlers.py
CHANGED
|
@@ -13,7 +13,7 @@ from acestep.constants import (
|
|
| 13 |
TASK_TYPES_BASE,
|
| 14 |
)
|
| 15 |
from acestep.gradio_ui.i18n import t
|
| 16 |
-
from acestep.inference import understand_music
|
| 17 |
|
| 18 |
|
| 19 |
def load_metadata(file_obj):
|
|
@@ -254,6 +254,65 @@ def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug
|
|
| 254 |
return load_random_example(task_type)
|
| 255 |
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
def refresh_checkpoints(dit_handler):
|
| 258 |
"""Refresh available checkpoints"""
|
| 259 |
choices = dit_handler.get_available_checkpoints()
|
|
@@ -502,6 +561,24 @@ def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
|
|
| 502 |
return current_lyrics
|
| 503 |
|
| 504 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
def update_audio_components_visibility(batch_size):
|
| 506 |
"""Show/hide individual audio components based on batch size (1-8)
|
| 507 |
|
|
@@ -532,3 +609,192 @@ def update_audio_components_visibility(batch_size):
|
|
| 532 |
return updates_row1 + updates_row2
|
| 533 |
|
| 534 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
TASK_TYPES_BASE,
|
| 14 |
)
|
| 15 |
from acestep.gradio_ui.i18n import t
|
| 16 |
+
from acestep.inference import understand_music, create_sample
|
| 17 |
|
| 18 |
|
| 19 |
def load_metadata(file_obj):
|
|
|
|
| 254 |
return load_random_example(task_type)
|
| 255 |
|
| 256 |
|
| 257 |
+
def load_random_simple_description():
    """Load a random description from the simple_mode examples directory.

    Picks one ``*.json`` file at random from ``<project_root>/examples/simple_mode``
    and reads its ``description``, ``instrumental`` and ``vocal_language`` fields.

    Returns:
        Tuple of (description, instrumental, vocal_language) for updating UI
        components. ``vocal_language`` is always returned as a list: a bare
        string is wrapped, and an explicit JSON ``null`` falls back to
        ``['unknown']``. On any failure a warning is shown and three no-op
        ``gr.update()`` values are returned so the UI keeps its current state.
    """
    selected_file = None
    try:
        # This file is in acestep/gradio_ui/events/, so the project root is
        # four dirname() steps up from the file path itself.
        project_root = os.path.abspath(__file__)
        for _ in range(4):
            project_root = os.path.dirname(project_root)

        examples_dir = os.path.join(project_root, "examples", "simple_mode")

        # A regular file with this name is not a usable examples directory.
        if not os.path.isdir(examples_dir):
            gr.Warning(t("messages.simple_examples_not_found"))
            return gr.update(), gr.update(), gr.update()

        json_files = glob.glob(os.path.join(examples_dir, "*.json"))
        if not json_files:
            gr.Warning(t("messages.simple_examples_empty"))
            return gr.update(), gr.update(), gr.update()

        selected_file = random.choice(json_files)
        with open(selected_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        description = data.get('description', '')
        instrumental = data.get('instrumental', False)
        vocal_language = data.get('vocal_language', ['unknown'])

        # Normalise to a list; .get() only applies its default when the key
        # is absent, so an explicit null still arrives here as None.
        if isinstance(vocal_language, str):
            vocal_language = [vocal_language]
        elif vocal_language is None:
            vocal_language = ['unknown']

        gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
        return description, instrumental, vocal_language

    except json.JSONDecodeError as e:
        # Only json.load() raises this, so selected_file is set by now; the
        # "or ''" merely keeps the handler itself from ever raising.
        gr.Warning(t("messages.example_failed", filename=os.path.basename(selected_file or ""), error=str(e)))
        return gr.update(), gr.update(), gr.update()
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing the
        # click handler.
        gr.Warning(t("messages.example_error", error=str(e)))
        return gr.update(), gr.update(), gr.update()
|
| 314 |
+
|
| 315 |
+
|
| 316 |
def refresh_checkpoints(dit_handler):
|
| 317 |
"""Refresh available checkpoints"""
|
| 318 |
choices = dit_handler.get_available_checkpoints()
|
|
|
|
| 561 |
return current_lyrics
|
| 562 |
|
| 563 |
|
| 564 |
+
def handle_simple_instrumental_change(is_instrumental: bool):
    """Update the simple-mode vocal language dropdown for the instrumental toggle.

    Args:
        is_instrumental: Whether the instrumental checkbox is checked.

    Returns:
        A ``gr.update`` for the simple_vocal_language dropdown: re-enabled for
        editing when unchecked; forced to ``["unknown"]`` and locked when
        checked.
    """
    if not is_instrumental:
        # Unchecked: only re-enable editing, the current selection is kept.
        return gr.update(interactive=True)

    return gr.update(value=["unknown"], interactive=False)
|
| 580 |
+
|
| 581 |
+
|
| 582 |
def update_audio_components_visibility(batch_size):
|
| 583 |
"""Show/hide individual audio components based on batch size (1-8)
|
| 584 |
|
|
|
|
| 609 |
return updates_row1 + updates_row2
|
| 610 |
|
| 611 |
|
| 612 |
+
def handle_generation_mode_change(mode: str):
    """Handle generation mode change between Simple and Custom modes.

    In Simple mode (``mode == "simple"``):
    - Show the simple mode group
    - Collapse the caption and lyrics accordions
    - Collapse the optional parameters accordion
    - Disable the generate button

    For any other mode value (i.e. Custom):
    - Hide the simple mode group
    - Expand the caption and lyrics accordions
    - Expand the optional parameters accordion
    - Enable the generate button

    Args:
        mode: "simple" or "custom"

    Returns:
        Tuple of updates for:
        - simple_mode_group (visibility)
        - caption_accordion (open state)
        - lyrics_accordion (open state)
        - generate_btn (interactive state)
        - simple_sample_created (always reset to False)
        - optional_params_accordion (open state)
    """
    is_simple = mode == "simple"

    return (
        gr.update(visible=is_simple),  # simple_mode_group
        gr.update(open=not is_simple),  # caption_accordion - collapsed in simple, open in custom
        gr.update(open=not is_simple),  # lyrics_accordion - collapsed in simple, open in custom
        gr.update(interactive=not is_simple),  # generate_btn - disabled in simple mode
        False,  # simple_sample_created - reset to False on mode change
        # NOTE(review): this only collapses the accordion (open=...); it stays
        # visible. If it should be hidden in simple mode, use visible=... instead.
        gr.update(open=not is_simple),  # optional_params_accordion - collapsed in simple mode
    )
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
def _create_sample_failure(message):
    """Warn the user and build the 15-value "sample not created" output tuple.

    The tuple follows the output order documented on ``handle_create_sample``:
    every field is left untouched except generate_btn (kept disabled),
    simple_sample_created (False) and status_output (``message``).
    """
    gr.Warning(message)
    # captions, lyrics, bpm, audio_duration, key_scale, vocal_language,
    # time_signature, instrumental_checkbox, caption_accordion, lyrics_accordion
    unchanged = [gr.update() for _ in range(10)]
    return (
        *unchanged,
        gr.update(interactive=False),  # generate_btn - keep disabled
        False,  # simple_sample_created - still False
        gr.update(),  # think_checkbox - no change
        gr.update(),  # is_format_caption_state - no change
        message,  # status_output
    )


def handle_create_sample(
    llm_handler,
    query: str,
    instrumental: bool,
    vocal_language: list,
    lm_temperature: float,
    lm_top_k: int,
    lm_top_p: float,
    constrained_decoding_debug: bool = False,
):
    """Handle the Create Sample button click in Simple mode.

    Creates a sample from the user's query using the LLM, then populates
    the caption, lyrics, and metadata fields.

    Note: cfg_scale and negative_prompt are not supported in create_sample mode.

    Args:
        llm_handler: LLM handler instance
        query: User's natural language music description
        instrumental: Whether to generate instrumental music
        vocal_language: List of preferred vocal languages for constrained decoding
        lm_temperature: LLM temperature for generation
        lm_top_k: LLM top-k sampling; a falsy value (0/None) is passed on as None
        lm_top_p: LLM top-p sampling; a falsy value or one >= 1.0 is passed on as None
        constrained_decoding_debug: Whether to enable debug logging

    Returns:
        Tuple of updates for:
        - captions
        - lyrics
        - bpm
        - audio_duration
        - key_scale
        - vocal_language
        - time_signature
        - instrumental_checkbox
        - caption_accordion (open)
        - lyrics_accordion (open)
        - generate_btn (interactive)
        - simple_sample_created (True)
        - think_checkbox (True)
        - is_format_caption_state (True)
        - status_output

        When the query is empty, the LLM is not initialized, or sample
        creation fails, a warning is shown and the fields are left unchanged
        with the generate button disabled.
    """
    # Validate query
    if not query or not query.strip():
        return _create_sample_failure(t("messages.empty_query"))

    # Check if LLM is initialized
    if not llm_handler.llm_initialized:
        return _create_sample_failure(t("messages.lm_not_initialized"))

    # Convert LM parameters: a falsy top-k and a top-p that is falsy or >= 1.0
    # are both passed to create_sample as None.
    top_k_value = int(lm_top_k) if lm_top_k else None
    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p

    # Call create_sample API
    # Note: cfg_scale and negative_prompt are not supported in create_sample mode
    result = create_sample(
        llm_handler=llm_handler,
        query=query,
        instrumental=instrumental,
        vocal_language=vocal_language,
        temperature=lm_temperature,
        top_k=top_k_value,
        top_p=top_p_value,
        use_constrained_decoding=True,
        constrained_decoding_debug=constrained_decoding_debug,
    )

    # Handle error
    if not result.success:
        return _create_sample_failure(
            result.status_message or t("messages.sample_creation_failed")
        )

    # Success - populate fields
    gr.Info(t("messages.sample_created"))

    return (
        result.caption,  # captions
        result.lyrics,  # lyrics
        result.bpm,  # bpm
        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
        result.keyscale,  # key_scale
        result.language,  # vocal_language
        result.timesignature,  # time_signature
        result.instrumental,  # instrumental_checkbox
        gr.update(open=True),  # caption_accordion - expand
        gr.update(open=True),  # lyrics_accordion - expand
        gr.update(interactive=True),  # generate_btn - enable
        True,  # simple_sample_created - True
        True,  # think_checkbox - enable thinking
        True,  # is_format_caption_state - True (LM-generated)
        result.status_message,  # status_output
    )
|
| 799 |
+
|
| 800 |
+
|
acestep/gradio_ui/i18n/en.json
CHANGED
|
@@ -79,11 +79,20 @@
|
|
| 79 |
"repainting_controls": "🎨 Repainting Controls (seconds)",
|
| 80 |
"repainting_start": "Repainting Start",
|
| 81 |
"repainting_end": "Repainting End",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
"caption_title": "📝 Music Caption",
|
| 83 |
"caption_label": "Music Caption (optional)",
|
| 84 |
"caption_placeholder": "A peaceful acoustic guitar melody with soft vocals...",
|
| 85 |
"caption_info": "Describe the style, genre, instruments, and mood",
|
| 86 |
-
"sample_btn": "Sample",
|
| 87 |
"lyrics_title": "📝 Lyrics",
|
| 88 |
"lyrics_label": "Lyrics (optional)",
|
| 89 |
"lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
|
|
@@ -212,6 +221,12 @@
|
|
| 212 |
"lrc_no_extra_outputs": "❌ No extra outputs found. Condition tensors not available.",
|
| 213 |
"lrc_missing_tensors": "❌ Missing required tensors for LRC generation.",
|
| 214 |
"lrc_sample_not_exist": "❌ Sample does not exist in current batch.",
|
| 215 |
-
"lrc_empty_result": "⚠️ LRC generation produced empty result."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
}
|
| 217 |
}
|
|
|
|
| 79 |
"repainting_controls": "🎨 Repainting Controls (seconds)",
|
| 80 |
"repainting_start": "Repainting Start",
|
| 81 |
"repainting_end": "Repainting End",
|
| 82 |
+
"mode_label": "Generation Mode",
|
| 83 |
+
"mode_info": "Simple: describe music in natural language. Custom: full control over caption and lyrics.",
|
| 84 |
+
"mode_simple": "Simple",
|
| 85 |
+
"mode_custom": "Custom",
|
| 86 |
+
"simple_query_label": "Song Description",
|
| 87 |
+
"simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'",
|
| 88 |
+
"simple_query_info": "Enter a natural language description of the music you want to generate",
|
| 89 |
+
"simple_vocal_language_label": "Vocal Language (optional)",
|
| 90 |
+
"simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
|
| 91 |
+
"create_sample_btn": "Create Sample",
|
| 92 |
"caption_title": "📝 Music Caption",
|
| 93 |
"caption_label": "Music Caption (optional)",
|
| 94 |
"caption_placeholder": "A peaceful acoustic guitar melody with soft vocals...",
|
| 95 |
"caption_info": "Describe the style, genre, instruments, and mood",
|
|
|
|
| 96 |
"lyrics_title": "📝 Lyrics",
|
| 97 |
"lyrics_label": "Lyrics (optional)",
|
| 98 |
"lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
|
|
|
|
| 221 |
"lrc_no_extra_outputs": "❌ No extra outputs found. Condition tensors not available.",
|
| 222 |
"lrc_missing_tensors": "❌ Missing required tensors for LRC generation.",
|
| 223 |
"lrc_sample_not_exist": "❌ Sample does not exist in current batch.",
|
| 224 |
+
"lrc_empty_result": "⚠️ LRC generation produced empty result.",
|
| 225 |
+
"empty_query": "⚠️ Please enter a music description.",
|
| 226 |
+
"sample_creation_failed": "❌ Failed to create sample. Please try again.",
|
| 227 |
+
"sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
|
| 228 |
+
"simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
|
| 229 |
+
"simple_examples_empty": "⚠️ No example files found in simple mode examples.",
|
| 230 |
+
"simple_example_loaded": "🎲 Loaded random example from {filename}"
|
| 231 |
}
|
| 232 |
}
|
acestep/gradio_ui/i18n/ja.json
CHANGED
|
@@ -79,11 +79,20 @@
|
|
| 79 |
"repainting_controls": "🎨 再描画コントロール(秒)",
|
| 80 |
"repainting_start": "再描画開始",
|
| 81 |
"repainting_end": "再描画終了",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
"caption_title": "📝 音楽キャプション",
|
| 83 |
"caption_label": "音楽キャプション(オプション)",
|
| 84 |
"caption_placeholder": "柔らかいボーカルを伴う穏やかなアコースティックギターのメロディー...",
|
| 85 |
"caption_info": "スタイル、ジャンル、楽器、ムードを説明",
|
| 86 |
-
"sample_btn": "サンプル",
|
| 87 |
"lyrics_title": "📝 歌詞",
|
| 88 |
"lyrics_label": "歌詞(オプション)",
|
| 89 |
"lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
|
|
@@ -212,6 +221,12 @@
|
|
| 212 |
"lrc_no_extra_outputs": "❌ 追加出力が見つかりません。条件テンソルが利用できません。",
|
| 213 |
"lrc_missing_tensors": "❌ LRC生成に必要なテンソルがありません。",
|
| 214 |
"lrc_sample_not_exist": "❌ 現在のバッチにサンプルが存在しません。",
|
| 215 |
-
"lrc_empty_result": "⚠️ LRC生成の結果が空です。"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
}
|
| 217 |
}
|
|
|
|
| 79 |
"repainting_controls": "🎨 再描画コントロール(秒)",
|
| 80 |
"repainting_start": "再描画開始",
|
| 81 |
"repainting_end": "再描画終了",
|
| 82 |
+
"mode_label": "生成モード",
|
| 83 |
+
"mode_info": "シンプル:自然言語で音楽を説明。カスタム:キャプションと歌詞を完全にコントロール。",
|
| 84 |
+
"mode_simple": "シンプル",
|
| 85 |
+
"mode_custom": "カスタム",
|
| 86 |
+
"simple_query_label": "曲の説明",
|
| 87 |
+
"simple_query_placeholder": "作成したい音楽を説明してください。例:'静かな夜のための優しいベンガルのラブソング'",
|
| 88 |
+
"simple_query_info": "生成したい音楽の自然言語の説明を入力",
|
| 89 |
+
"simple_vocal_language_label": "ボーカル言語(オプション)",
|
| 90 |
+
"simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
|
| 91 |
+
"create_sample_btn": "サンプル作成",
|
| 92 |
"caption_title": "📝 音楽キャプション",
|
| 93 |
"caption_label": "音楽キャプション(オプション)",
|
| 94 |
"caption_placeholder": "柔らかいボーカルを伴う穏やかなアコースティックギターのメロディー...",
|
| 95 |
"caption_info": "スタイル、ジャンル、楽器、ムードを説明",
|
|
|
|
| 96 |
"lyrics_title": "📝 歌詞",
|
| 97 |
"lyrics_label": "歌詞(オプション)",
|
| 98 |
"lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
|
|
|
|
| 221 |
"lrc_no_extra_outputs": "❌ 追加出力が見つかりません。条件テンソルが利用できません。",
|
| 222 |
"lrc_missing_tensors": "❌ LRC生成に必要なテンソルがありません。",
|
| 223 |
"lrc_sample_not_exist": "❌ 現在のバッチにサンプルが存在しません。",
|
| 224 |
+
"lrc_empty_result": "⚠️ LRC生成の結果が空です。",
|
| 225 |
+
"empty_query": "⚠️ 音楽の説明を入力してください。",
|
| 226 |
+
"sample_creation_failed": "❌ サンプルの作成に失敗しました。もう一度お試しください。",
|
| 227 |
+
"sample_created": "✅ サンプルが作成されました!キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
|
| 228 |
+
"simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
|
| 229 |
+
"simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
|
| 230 |
+
"simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました"
|
| 231 |
}
|
| 232 |
}
|
acestep/gradio_ui/i18n/zh.json
CHANGED
|
@@ -79,11 +79,20 @@
|
|
| 79 |
"repainting_controls": "🎨 重绘控制(秒)",
|
| 80 |
"repainting_start": "重绘开始",
|
| 81 |
"repainting_end": "重绘结束",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
"caption_title": "📝 音乐描述",
|
| 83 |
"caption_label": "音乐描述(可选)",
|
| 84 |
"caption_placeholder": "一段平和的原声吉他旋律,配有柔和的人声...",
|
| 85 |
"caption_info": "描述风格、流派、乐器和情绪",
|
| 86 |
-
"sample_btn": "示例",
|
| 87 |
"lyrics_title": "📝 歌词",
|
| 88 |
"lyrics_label": "歌词(可选)",
|
| 89 |
"lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
|
|
@@ -212,6 +221,12 @@
|
|
| 212 |
"lrc_no_extra_outputs": "❌ 未找到额外输出。条件张量不可用。",
|
| 213 |
"lrc_missing_tensors": "❌ 缺少LRC生成所需的张量。",
|
| 214 |
"lrc_sample_not_exist": "❌ 当前批次中不存在该样本。",
|
| 215 |
-
"lrc_empty_result": "⚠️ LRC生成结果为空。"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
}
|
| 217 |
}
|
|
|
|
| 79 |
"repainting_controls": "🎨 重绘控制(秒)",
|
| 80 |
"repainting_start": "重绘开始",
|
| 81 |
"repainting_end": "重绘结束",
|
| 82 |
+
"mode_label": "生成模式",
|
| 83 |
+
"mode_info": "简单模式:用自然语言描述音乐。自定义模式:完全控制描述和歌词。",
|
| 84 |
+
"mode_simple": "简单",
|
| 85 |
+
"mode_custom": "自定义",
|
| 86 |
+
"simple_query_label": "歌曲描述",
|
| 87 |
+
"simple_query_placeholder": "描述你想创作的音乐,例如:'给我生成一首暗黑的戏剧古风,歌词要华丽'",
|
| 88 |
+
"simple_query_info": "输入你想生成的音乐的自然语言描述",
|
| 89 |
+
"simple_vocal_language_label": "人声语言(可选)",
|
| 90 |
+
"simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
|
| 91 |
+
"create_sample_btn": "创建样本",
|
| 92 |
"caption_title": "📝 音乐描述",
|
| 93 |
"caption_label": "音乐描述(可选)",
|
| 94 |
"caption_placeholder": "一段平和的原声吉他旋律,配有柔和的人声...",
|
| 95 |
"caption_info": "描述风格、流派、乐器和情绪",
|
|
|
|
| 96 |
"lyrics_title": "📝 歌词",
|
| 97 |
"lyrics_label": "歌词(可选)",
|
| 98 |
"lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
|
|
|
|
| 221 |
"lrc_no_extra_outputs": "❌ 未找到额外输出。条件张量不可用。",
|
| 222 |
"lrc_missing_tensors": "❌ 缺少LRC生成所需的张量。",
|
| 223 |
"lrc_sample_not_exist": "❌ 当前批次中不存在该样本。",
|
| 224 |
+
"lrc_empty_result": "⚠️ LRC生成结果为空。",
|
| 225 |
+
"empty_query": "⚠️ 请输入音乐描述。",
|
| 226 |
+
"sample_creation_failed": "❌ 创建样本失败。请重试。",
|
| 227 |
+
"sample_created": "✅ 样本已创建!检查描述和歌词,然后点击生成音乐。",
|
| 228 |
+
"simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
|
| 229 |
+
"simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
|
| 230 |
+
"simple_example_loaded": "🎲 已从 {filename} 加载随机示例"
|
| 231 |
}
|
| 232 |
}
|
acestep/gradio_ui/interfaces/generation.py
CHANGED
|
@@ -250,9 +250,64 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 250 |
minimum=-1,
|
| 251 |
step=0.1,
|
| 252 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
-
# Music Caption
|
| 255 |
-
with gr.Accordion(t("generation.caption_title"), open=
|
| 256 |
with gr.Row(equal_height=True):
|
| 257 |
captions = gr.Textbox(
|
| 258 |
label=t("generation.caption_label"),
|
|
@@ -262,14 +317,14 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 262 |
scale=9,
|
| 263 |
)
|
| 264 |
sample_btn = gr.Button(
|
| 265 |
-
|
| 266 |
variant="secondary",
|
| 267 |
size="sm",
|
| 268 |
scale=1,
|
| 269 |
)
|
| 270 |
|
| 271 |
-
# Lyrics
|
| 272 |
-
with gr.Accordion(t("generation.lyrics_title"), open=
|
| 273 |
lyrics = gr.Textbox(
|
| 274 |
label=t("generation.lyrics_label"),
|
| 275 |
placeholder=t("generation.lyrics_placeholder"),
|
|
@@ -283,7 +338,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 283 |
)
|
| 284 |
|
| 285 |
# Optional Parameters
|
| 286 |
-
with gr.Accordion(t("generation.optional_params"), open=
|
| 287 |
with gr.Row():
|
| 288 |
vocal_language = gr.Dropdown(
|
| 289 |
choices=VALID_LANGUAGES,
|
|
@@ -587,6 +642,19 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 587 |
"repainting_start": repainting_start,
|
| 588 |
"repainting_end": repainting_end,
|
| 589 |
"audio_cover_strength": audio_cover_strength,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
"captions": captions,
|
| 591 |
"sample_btn": sample_btn,
|
| 592 |
"load_file": load_file,
|
|
|
|
| 250 |
minimum=-1,
|
| 251 |
step=0.1,
|
| 252 |
)
|
| 253 |
+
|
| 254 |
+
# Simple/Custom Mode Toggle
|
| 255 |
+
with gr.Row():
|
| 256 |
+
generation_mode = gr.Radio(
|
| 257 |
+
choices=[
|
| 258 |
+
(t("generation.mode_simple"), "simple"),
|
| 259 |
+
(t("generation.mode_custom"), "custom"),
|
| 260 |
+
],
|
| 261 |
+
value="simple",
|
| 262 |
+
label=t("generation.mode_label"),
|
| 263 |
+
info=t("generation.mode_info"),
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
# Simple Mode Components - visible only in Simple mode
|
| 267 |
+
with gr.Group(visible=True) as simple_mode_group:
|
| 268 |
+
with gr.Row(equal_height=True):
|
| 269 |
+
simple_query_input = gr.Textbox(
|
| 270 |
+
label=t("generation.simple_query_label"),
|
| 271 |
+
placeholder=t("generation.simple_query_placeholder"),
|
| 272 |
+
lines=2,
|
| 273 |
+
info=t("generation.simple_query_info"),
|
| 274 |
+
scale=12,
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
with gr.Column(scale=1, min_width=100):
|
| 278 |
+
random_desc_btn = gr.Button(
|
| 279 |
+
"🎲",
|
| 280 |
+
variant="secondary",
|
| 281 |
+
size="sm",
|
| 282 |
+
scale=2
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
+
with gr.Row(equal_height=True):
|
| 286 |
+
with gr.Column(scale=1, variant="compact"):
|
| 287 |
+
simple_instrumental_checkbox = gr.Checkbox(
|
| 288 |
+
label=t("generation.instrumental_label"),
|
| 289 |
+
value=False,
|
| 290 |
+
)
|
| 291 |
+
with gr.Column(scale=18):
|
| 292 |
+
create_sample_btn = gr.Button(
|
| 293 |
+
t("generation.create_sample_btn"),
|
| 294 |
+
variant="primary",
|
| 295 |
+
size="lg",
|
| 296 |
+
)
|
| 297 |
+
with gr.Column(scale=1, variant="compact"):
|
| 298 |
+
simple_vocal_language = gr.Dropdown(
|
| 299 |
+
choices=VALID_LANGUAGES,
|
| 300 |
+
value="unknown",
|
| 301 |
+
allow_custom_value=True,
|
| 302 |
+
label=t("generation.simple_vocal_language_label"),
|
| 303 |
+
interactive=True,
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
+
# State to track if sample has been created in Simple mode
|
| 307 |
+
simple_sample_created = gr.State(value=False)
|
| 308 |
|
| 309 |
+
# Music Caption - wrapped in accordion that can be collapsed in Simple mode
|
| 310 |
+
with gr.Accordion(t("generation.caption_title"), open=False) as caption_accordion:
|
| 311 |
with gr.Row(equal_height=True):
|
| 312 |
captions = gr.Textbox(
|
| 313 |
label=t("generation.caption_label"),
|
|
|
|
| 317 |
scale=9,
|
| 318 |
)
|
| 319 |
sample_btn = gr.Button(
|
| 320 |
+
"🎲",
|
| 321 |
variant="secondary",
|
| 322 |
size="sm",
|
| 323 |
scale=1,
|
| 324 |
)
|
| 325 |
|
| 326 |
+
# Lyrics - wrapped in accordion that can be collapsed in Simple mode
|
| 327 |
+
with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
|
| 328 |
lyrics = gr.Textbox(
|
| 329 |
label=t("generation.lyrics_label"),
|
| 330 |
placeholder=t("generation.lyrics_placeholder"),
|
|
|
|
| 338 |
)
|
| 339 |
|
| 340 |
# Optional Parameters
|
| 341 |
+
with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
|
| 342 |
with gr.Row():
|
| 343 |
vocal_language = gr.Dropdown(
|
| 344 |
choices=VALID_LANGUAGES,
|
|
|
|
| 642 |
"repainting_start": repainting_start,
|
| 643 |
"repainting_end": repainting_end,
|
| 644 |
"audio_cover_strength": audio_cover_strength,
|
| 645 |
+
# Simple/Custom Mode Components
|
| 646 |
+
"generation_mode": generation_mode,
|
| 647 |
+
"simple_mode_group": simple_mode_group,
|
| 648 |
+
"simple_query_input": simple_query_input,
|
| 649 |
+
"random_desc_btn": random_desc_btn,
|
| 650 |
+
"simple_instrumental_checkbox": simple_instrumental_checkbox,
|
| 651 |
+
"simple_vocal_language": simple_vocal_language,
|
| 652 |
+
"create_sample_btn": create_sample_btn,
|
| 653 |
+
"simple_sample_created": simple_sample_created,
|
| 654 |
+
"caption_accordion": caption_accordion,
|
| 655 |
+
"lyrics_accordion": lyrics_accordion,
|
| 656 |
+
"optional_params_accordion": optional_params_accordion,
|
| 657 |
+
# Existing components
|
| 658 |
"captions": captions,
|
| 659 |
"sample_btn": sample_btn,
|
| 660 |
"load_file": load_file,
|
acestep/inference.py
CHANGED
|
@@ -797,3 +797,188 @@ def understand_music(
|
|
| 797 |
success=False,
|
| 798 |
error=str(e),
|
| 799 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 797 |
success=False,
|
| 798 |
error=str(e),
|
| 799 |
)
|
| 800 |
+
|
| 801 |
+
|
| 802 |
+
@dataclass
class CreateSampleResult:
    """Result of creating a music sample from a natural language query.

    Used by the "Simple Mode" / "Inspiration Mode" feature, where the user
    supplies a natural language description and the LLM produces a complete
    sample (caption, lyrics and metadata).

    Attributes:
        caption: Generated detailed music description/caption.
        lyrics: Generated lyrics (or "[Instrumental]" for instrumental music).
        bpm: Beats per minute (None if not generated or not parseable).
        duration: Duration in seconds (None if not generated or not parseable).
        keyscale: Musical key (e.g., "C Major").
        language: Vocal language code (e.g., "en", "zh").
        timesignature: Time signature (e.g., "4").
        instrumental: Whether this is an instrumental piece.
        status_message: Status message from sample creation.
        success: Whether sample creation completed successfully.
        error: Error message if sample creation failed.
    """
    # Metadata fields
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""
    instrumental: bool = False

    # Status
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert result to dictionary for JSON serialization."""
        return asdict(self)


def _to_finite_float(value: Any) -> Optional[float]:
    """Return *value* as a finite float, or None when it is missing or unusable."""
    import math

    if value is None or value == 'N/A' or value == '':
        return None
    try:
        number = float(value)
    except (ValueError, TypeError):
        return None
    # NaN / infinity cannot be a meaningful BPM or duration.
    return number if math.isfinite(number) else None


def _blank_if_missing(value: Any) -> Any:
    """Map None and the 'N/A' placeholder to an empty string; pass others through."""
    return '' if value is None or value == 'N/A' else value


def create_sample(
    llm_handler,
    query: str,
    instrumental: bool = False,
    vocal_language: Optional[List[str]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> CreateSampleResult:
    """Create a music sample from a natural language query using the 5Hz Language Model.

    This is the "Simple Mode" / "Inspiration Mode" entry point. It delegates to
    ``llm_handler.create_sample_from_query`` and converts the returned metadata
    dictionary into a typed ``CreateSampleResult``.

    Note: cfg_scale and negative_prompt are not supported in create_sample mode.

    Args:
        llm_handler: Initialized LLM handler; must expose ``llm_initialized``
            and ``create_sample_from_query``.
        query: User's natural language music description
            (e.g., "a soft Bengali love song").
        instrumental: Whether to generate instrumental music (no vocals).
        vocal_language: Allowed vocal languages (e.g., ["en", "zh"]). A single
            language given as a plain string is wrapped into a one-element
            list before being forwarded. None means no language constraint.
        temperature: Sampling temperature for generation.
        top_k: Top-K sampling (None = disabled).
        top_p: Top-P (nucleus) sampling (None = disabled).
        repetition_penalty: Repetition penalty (1.0 = no penalty).
        use_constrained_decoding: Whether to use FSM-based constrained decoding.
        constrained_decoding_debug: Whether to enable debug logging.

    Returns:
        CreateSampleResult with the generated fields. On any failure
        (uninitialized LLM, empty query, empty metadata, or an exception raised
        by the handler) ``success`` is False and ``error`` describes the cause;
        this function does not raise.

    Example:
        >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language=["bn"])
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
        ...     print(f"Lyrics: {result.lyrics}")
        ...     print(f"BPM: {result.bpm}")
    """
    # Check if LLM is initialized
    if not llm_handler.llm_initialized:
        return CreateSampleResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    # Validate query
    if not query or not query.strip():
        return CreateSampleResult(
            status_message="No query provided. Please enter a music description.",
            success=False,
            error="Empty query",
        )

    # A bare string would otherwise be iterated character by character by the
    # handler's language filter, so wrap it into a list first.
    if isinstance(vocal_language, str):
        vocal_language = [vocal_language]

    try:
        metadata, status = llm_handler.create_sample_from_query(
            query=query,
            instrumental=instrumental,
            vocal_language=vocal_language,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # Empty metadata is the handler's way of signalling an error.
        if not metadata:
            return CreateSampleResult(
                status_message=status or "Failed to create sample",
                success=False,
                error=status or "Empty metadata returned",
            )

        # BPM is truncated to int via float so that values such as "120.0"
        # are accepted instead of being silently dropped.
        bpm_number = _to_finite_float(metadata.get('bpm'))
        bpm = int(bpm_number) if bpm_number is not None else None
        duration = _to_finite_float(metadata.get('duration'))

        return CreateSampleResult(
            caption=metadata.get('caption') or '',
            lyrics=metadata.get('lyrics') or '',
            bpm=bpm,
            duration=duration,
            keyscale=_blank_if_missing(metadata.get('keyscale', '')),
            language=_blank_if_missing(
                metadata.get('language', metadata.get('vocal_language', ''))
            ),
            timesignature=_blank_if_missing(metadata.get('timesignature', '')),
            instrumental=metadata.get('instrumental', instrumental),
            status_message=status,
            success=True,
            error=None,
        )

    except Exception as e:
        # Top-level boundary: log with traceback and report failure to the caller.
        logger.exception("Sample creation failed")
        return CreateSampleResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )
|
acestep/llm_inference.py
CHANGED
|
@@ -19,7 +19,7 @@ from transformers.generation.logits_process import (
|
|
| 19 |
RepetitionPenaltyLogitsProcessor,
|
| 20 |
)
|
| 21 |
from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
|
| 22 |
-
from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION
|
| 23 |
|
| 24 |
|
| 25 |
class LLMHandler:
|
|
@@ -308,7 +308,7 @@ class LLMHandler:
|
|
| 308 |
if not os.path.exists(full_lm_model_path):
|
| 309 |
return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
|
| 310 |
|
| 311 |
-
logger.info("loading 5Hz LM tokenizer...")
|
| 312 |
start_time = time.time()
|
| 313 |
# TODO: load tokenizer too slow, not found solution yet
|
| 314 |
llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
|
|
@@ -1433,6 +1433,185 @@ class LLMHandler:
|
|
| 1433 |
|
| 1434 |
return after_think.strip()
|
| 1435 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1436 |
def generate_from_formatted_prompt(
|
| 1437 |
self,
|
| 1438 |
formatted_prompt: str,
|
|
|
|
| 19 |
RepetitionPenaltyLogitsProcessor,
|
| 20 |
)
|
| 21 |
from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
|
| 22 |
+
from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION
|
| 23 |
|
| 24 |
|
| 25 |
class LLMHandler:
|
|
|
|
| 308 |
if not os.path.exists(full_lm_model_path):
|
| 309 |
return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
|
| 310 |
|
| 311 |
+
logger.info("loading 5Hz LM tokenizer... it may take 80~90s")
|
| 312 |
start_time = time.time()
|
| 313 |
# TODO: load tokenizer too slow, not found solution yet
|
| 314 |
llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
|
|
|
|
| 1433 |
|
| 1434 |
return after_think.strip()
|
| 1435 |
|
| 1436 |
+
def build_formatted_prompt_for_inspiration(
    self,
    query: str,
    instrumental: bool = False,
    is_negative_prompt: bool = False,
    negative_prompt: str = "NO USER INPUT"
) -> str:
    """
    Build the chat-formatted prompt used by inspiration/simple mode.

    The prompt pairs a system message carrying DEFAULT_LM_INSPIRED_INSTRUCTION
    with a user message, and is rendered through the tokenizer's chat template
    (not tokenized, with the generation prompt appended).

    Args:
        query: User's natural language music description
        instrumental: Whether to generate instrumental music (no vocals)
        is_negative_prompt: If True, the user message is the negative prompt
            (or an empty string when that is blank) instead of the query
        negative_prompt: Text used as the user message when is_negative_prompt=True

    Returns:
        Formatted prompt string

    Raises:
        ValueError: If the tokenizer has not been initialized.

    Example:
        query = "a soft Bengali love song for a quiet evening"
        prompt = handler.build_formatted_prompt_for_inspiration(query, instrumental=False)
    """
    tokenizer = self.llm_tokenizer
    if tokenizer is None:
        raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")

    if not is_negative_prompt:
        # Normal prompt: the query followed by the instrumental flag.
        flag = "true" if instrumental else "false"
        user_content = f"{query}\n\ninstrumental: {flag}"
    elif negative_prompt and negative_prompt.strip():
        # Unconditional (CFG) prompt built from the negative prompt.
        user_content = negative_prompt
    else:
        user_content = ""

    system_message = {
        "role": "system",
        "content": f"# Instruction\n{DEFAULT_LM_INSPIRED_INSTRUCTION}\n\n",
    }
    user_message = {"role": "user", "content": user_content}

    return tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )
|
| 1489 |
+
|
| 1490 |
+
def create_sample_from_query(
    self,
    query: str,
    instrumental: bool = False,
    vocal_language: Optional[List[str]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> Tuple[Dict[str, Any], str]:
    """
    Create a complete music sample from a user's natural language query.

    This is the "Simple Mode" / "Inspiration Mode" feature that generates:
    - Metadata (bpm, caption, duration, keyscale, language, timesignature)
    - Lyrics (unless instrumental=True)

    Args:
        query: User's natural language music description
        instrumental: Whether to generate instrumental music (no vocals)
        vocal_language: Allowed vocal languages (e.g., ["en", "zh"]). A single
            language passed as a plain string is treated as a one-element list.
            Blank, non-string and "unknown" entries are ignored; the first
            remaining language is injected as user metadata.
        temperature: Sampling temperature for generation (0.0-2.0)
        top_k: Top-K sampling (None = disabled)
        top_p: Top-P (nucleus) sampling (None = disabled)
        repetition_penalty: Repetition penalty (1.0 = no penalty)
        use_constrained_decoding: Whether to use FSM-based constrained decoding
        constrained_decoding_debug: Whether to enable debug logging

    Returns:
        Tuple of (metadata_dict, status_message). On failure the dictionary is
        empty and the status message describes the problem. On success the
        dictionary holds whatever parse_lm_output extracted, plus:
            - lyrics: str (text after </think>, or "[Instrumental]" when no
              lyrics were produced and instrumental=True)
            - instrumental: bool (echoed back)

    Example:
        query = "a soft Bengali love song for a quiet evening"
        metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language=["bn"])
        print(metadata['caption'])  # "A gentle romantic acoustic pop ballad..."
        print(metadata['lyrics'])   # "[Intro: ...]\\n..."
    """
    if not getattr(self, "llm_initialized", False):
        return {}, "❌ 5Hz LM not initialized. Please initialize it first."

    if not query or not query.strip():
        return {}, "❌ No query provided. Please enter a music description."

    logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")

    # Build formatted prompt for inspiration
    formatted_prompt = self.build_formatted_prompt_for_inspiration(
        query=query,
        instrumental=instrumental,
    )
    logger.debug(f"Formatted prompt for inspiration: {formatted_prompt}")

    # Normalize the language constraint. A bare string must be wrapped first,
    # otherwise iterating it would yield single characters ("en" -> "e").
    if isinstance(vocal_language, str):
        candidate_languages = [vocal_language]
    else:
        candidate_languages = vocal_language or []
    valid_languages = [
        lang.strip()
        for lang in candidate_languages
        if isinstance(lang, str) and lang.strip() and lang.strip().lower() != "unknown"
    ]

    user_metadata = None
    skip_language = False
    if valid_languages:
        # Use the first valid language for constrained decoding and skip
        # language generation since it is being injected.
        user_metadata = {"language": valid_languages[0]}
        skip_language = True
        logger.info(f"Using user-specified language: {valid_languages[0]}")

    # Generate metadata first (CoT), then free-form lyrics.
    # Note: cfg_scale and negative_prompt are not used in create_sample mode.
    output_text, status = self.generate_from_formatted_prompt(
        formatted_prompt=formatted_prompt,
        cfg={
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "target_duration": None,  # No duration constraint
            "user_metadata": user_metadata,  # Inject language if specified
            "skip_caption": False,  # Generate caption
            "skip_language": skip_language,  # Skip if we're injecting language
            "skip_genres": False,  # Generate genres
            "generation_phase": "understand",  # Use understand phase for metadata + free-form lyrics
            "caption": "",
            "lyrics": "",
        },
        use_constrained_decoding=use_constrained_decoding,
        constrained_decoding_debug=constrained_decoding_debug,
        stop_at_reasoning=False,  # Continue after </think> to generate lyrics
    )

    if not output_text:
        return {}, status

    # Parse metadata and extract lyrics
    metadata, _ = self.parse_lm_output(output_text)

    # Lyrics are everything after </think>
    lyrics = self._extract_lyrics_from_output(output_text)
    if lyrics:
        metadata['lyrics'] = lyrics
    elif instrumental:
        # No lyrics were produced for an instrumental request: use a placeholder.
        metadata['lyrics'] = "[Instrumental]"

    # Echo back the instrumental flag
    metadata['instrumental'] = instrumental

    logger.info(f"Sample created successfully. Generated {len(metadata)} fields")
    if constrained_decoding_debug:
        logger.debug(f"Generated metadata: {list(metadata.keys())}")
        logger.debug(f"Output text preview: {output_text[:300]}...")

    status_msg = f"✅ Sample created successfully\nGenerated fields: {', '.join(metadata.keys())}"
    return metadata, status_msg
|
| 1614 |
+
|
| 1615 |
def generate_from_formatted_prompt(
|
| 1616 |
self,
|
| 1617 |
formatted_prompt: str,
|
examples/simple_mode/example_01.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "a soft Bengali love song for a quiet evening",
|
| 3 |
+
"instrumental": false,
|
| 4 |
+
"vocal_language": ["bn"]
|
| 5 |
+
}
|
examples/simple_mode/example_02.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "an upbeat summer pop song with catchy hooks",
|
| 3 |
+
"instrumental": false,
|
| 4 |
+
"vocal_language": ["en"]
|
| 5 |
+
}
|
examples/simple_mode/example_03.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "epic orchestral cinematic music for a movie trailer",
|
| 3 |
+
"instrumental": true,
|
| 4 |
+
"vocal_language": ["unknown"]
|
| 5 |
+
}
|
examples/simple_mode/example_04.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "一首深情的中文抒情歌曲,适合夜晚独自聆听",
|
| 3 |
+
"instrumental": false,
|
| 4 |
+
"vocal_language": ["zh"]
|
| 5 |
+
}
|
examples/simple_mode/example_05.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "Japanese city pop with nostalgic 80s vibes",
|
| 3 |
+
"instrumental": false,
|
| 4 |
+
"vocal_language": ["ja"]
|
| 5 |
+
}
|
examples/simple_mode/example_06.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "lo-fi hip hop beats for studying and relaxing",
|
| 3 |
+
"instrumental": true,
|
| 4 |
+
"vocal_language": ["unknown"]
|
| 5 |
+
}
|
examples/simple_mode/example_07.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "energetic K-pop dance track with powerful vocals",
|
| 3 |
+
"instrumental": false,
|
| 4 |
+
"vocal_language": ["ko"]
|
| 5 |
+
}
|
examples/simple_mode/example_08.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "romantic Spanish guitar ballad with heartfelt lyrics",
|
| 3 |
+
"instrumental": false,
|
| 4 |
+
"vocal_language": ["es"]
|
| 5 |
+
}
|
examples/simple_mode/example_09.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "中国风电子舞曲,融合古典乐器与现代节拍",
|
| 3 |
+
"instrumental": false,
|
| 4 |
+
"vocal_language": ["zh"]
|
| 5 |
+
}
|
examples/simple_mode/example_10.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "peaceful piano melody for meditation and relaxation",
|
| 3 |
+
"instrumental": true,
|
| 4 |
+
"vocal_language": ["unknown"]
|
| 5 |
+
}
|