ChuxiJ committed
Commit c9570f3 · 1 Parent(s): 748bd62

fix lrc bugs

acestep/gradio_ui/events/__init__.py CHANGED
@@ -92,31 +92,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
  ]
  )

- # Update codes hints visibility
- for trigger in [generation_section["src_audio"], generation_section["allow_lm_batch"], generation_section["batch_size_input"]]:
- trigger.change(
- fn=gen_h.update_codes_hints_visibility,
- inputs=[
- generation_section["src_audio"],
- generation_section["allow_lm_batch"],
- generation_section["batch_size_input"]
- ],
- outputs=[
- generation_section["codes_single_row"],
- generation_section["codes_batch_row"],
- generation_section["codes_batch_row_2"],
- generation_section["codes_col_1"],
- generation_section["codes_col_2"],
- generation_section["codes_col_3"],
- generation_section["codes_col_4"],
- generation_section["codes_col_5"],
- generation_section["codes_col_6"],
- generation_section["codes_col_7"],
- generation_section["codes_col_8"],
- generation_section["transcribe_btn"],
- ]
- )
-
  # ========== Audio Conversion ==========
  generation_section["convert_src_to_codes_btn"].click(
  fn=lambda src: gen_h.convert_src_audio_to_codes_wrapper(dit_handler, src),
@@ -397,7 +372,9 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
  ],
  outputs=[
  results_section[f"lrc_display_{btn_idx}"],
- results_section[f"details_accordion_{btn_idx}"]
+ results_section[f"details_accordion_{btn_idx}"],
+ # Audio subtitles now auto-updated via lrc_display.change()
+ results_section["batch_queue"]
  ]
  )

@@ -445,6 +422,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
  generation_section["constrained_decoding_debug"],
  generation_section["allow_lm_batch"],
  generation_section["auto_score"],
+ generation_section["auto_lrc"],
  generation_section["score_scale"],
  generation_section["lm_batch_chunk_size"],
  generation_section["track_name"],
@@ -476,15 +454,30 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
  results_section["score_display_6"],
  results_section["score_display_7"],
  results_section["score_display_8"],
- generation_section["text2music_audio_code_string"],
- generation_section["text2music_audio_code_string_1"],
- generation_section["text2music_audio_code_string_2"],
- generation_section["text2music_audio_code_string_3"],
- generation_section["text2music_audio_code_string_4"],
- generation_section["text2music_audio_code_string_5"],
- generation_section["text2music_audio_code_string_6"],
- generation_section["text2music_audio_code_string_7"],
- generation_section["text2music_audio_code_string_8"],
+ results_section["codes_display_1"],
+ results_section["codes_display_2"],
+ results_section["codes_display_3"],
+ results_section["codes_display_4"],
+ results_section["codes_display_5"],
+ results_section["codes_display_6"],
+ results_section["codes_display_7"],
+ results_section["codes_display_8"],
+ results_section["details_accordion_1"],
+ results_section["details_accordion_2"],
+ results_section["details_accordion_3"],
+ results_section["details_accordion_4"],
+ results_section["details_accordion_5"],
+ results_section["details_accordion_6"],
+ results_section["details_accordion_7"],
+ results_section["details_accordion_8"],
+ results_section["lrc_display_1"],
+ results_section["lrc_display_2"],
+ results_section["lrc_display_3"],
+ results_section["lrc_display_4"],
+ results_section["lrc_display_5"],
+ results_section["lrc_display_6"],
+ results_section["lrc_display_7"],
+ results_section["lrc_display_8"],
  results_section["lm_metadata_state"],
  results_section["is_format_caption_state"],
  results_section["current_batch_index"],
@@ -546,6 +539,30 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
  results_section["score_display_6"],
  results_section["score_display_7"],
  results_section["score_display_8"],
+ results_section["codes_display_1"],
+ results_section["codes_display_2"],
+ results_section["codes_display_3"],
+ results_section["codes_display_4"],
+ results_section["codes_display_5"],
+ results_section["codes_display_6"],
+ results_section["codes_display_7"],
+ results_section["codes_display_8"],
+ results_section["lrc_display_1"],
+ results_section["lrc_display_2"],
+ results_section["lrc_display_3"],
+ results_section["lrc_display_4"],
+ results_section["lrc_display_5"],
+ results_section["lrc_display_6"],
+ results_section["lrc_display_7"],
+ results_section["lrc_display_8"],
+ results_section["details_accordion_1"],
+ results_section["details_accordion_2"],
+ results_section["details_accordion_3"],
+ results_section["details_accordion_4"],
+ results_section["details_accordion_5"],
+ results_section["details_accordion_6"],
+ results_section["details_accordion_7"],
+ results_section["details_accordion_8"],
  results_section["restore_params_btn"],
  ]
  )
@@ -590,6 +607,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
  generation_section["constrained_decoding_debug"],
  generation_section["allow_lm_batch"],
  generation_section["auto_score"],
+ generation_section["auto_lrc"],
  generation_section["score_scale"],
  generation_section["lm_batch_chunk_size"],
  generation_section["track_name"],
@@ -629,6 +647,30 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
  results_section["score_display_6"],
  results_section["score_display_7"],
  results_section["score_display_8"],
+ results_section["codes_display_1"],
+ results_section["codes_display_2"],
+ results_section["codes_display_3"],
+ results_section["codes_display_4"],
+ results_section["codes_display_5"],
+ results_section["codes_display_6"],
+ results_section["codes_display_7"],
+ results_section["codes_display_8"],
+ results_section["lrc_display_1"],
+ results_section["lrc_display_2"],
+ results_section["lrc_display_3"],
+ results_section["lrc_display_4"],
+ results_section["lrc_display_5"],
+ results_section["lrc_display_6"],
+ results_section["lrc_display_7"],
+ results_section["lrc_display_8"],
+ results_section["details_accordion_1"],
+ results_section["details_accordion_2"],
+ results_section["details_accordion_3"],
+ results_section["details_accordion_4"],
+ results_section["details_accordion_5"],
+ results_section["details_accordion_6"],
+ results_section["details_accordion_7"],
+ results_section["details_accordion_8"],
  results_section["restore_params_btn"],
  ]
  ).then(
@@ -658,14 +700,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
  ],
  outputs=[
  generation_section["text2music_audio_code_string"],
- generation_section["text2music_audio_code_string_1"],
- generation_section["text2music_audio_code_string_2"],
- generation_section["text2music_audio_code_string_3"],
- generation_section["text2music_audio_code_string_4"],
- generation_section["text2music_audio_code_string_5"],
- generation_section["text2music_audio_code_string_6"],
- generation_section["text2music_audio_code_string_7"],
- generation_section["text2music_audio_code_string_8"],
  generation_section["captions"],
  generation_section["lyrics"],
  generation_section["bpm"],
@@ -687,3 +721,16 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
  generation_section["complete_track_classes"],
  ]
  )
+
+ # ========== LRC Display Change Handlers ==========
+ # When lrc_display textbox changes, update the corresponding audio component's subtitles
+ for i in range(1, 9):
+ results_section[f"lrc_display_{i}"].change(
+ fn=res_h.update_audio_subtitles_from_lrc,
+ inputs=[
+ results_section[f"lrc_display_{i}"],
+ results_section[f"generated_audio_{i}"],
+ generation_section["audio_duration"],
+ ],
+ outputs=[results_section[f"generated_audio_{i}"]]
+ )
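For reference, a minimal sketch (not part of the commit) of what the per-sample wiring above does, assuming it runs inside the repo environment so that the handlers module imports cleanly; the module path and the res_h alias are taken from this diff, and the audio path is a made-up example. When an lrc_display textbox changes, update_audio_subtitles_from_lrc re-emits the audio value together with subtitles parsed from the LRC text.

from acestep.gradio_ui.events import results_handlers as res_h

lrc = "[00:10.00]Hello\n[00:12.50]World"
update = res_h.update_audio_subtitles_from_lrc(lrc, "/tmp/sample_1.mp3", audio_duration=15.0)
# update is a gr.update(...) payload carrying value="/tmp/sample_1.mp3" and
# subtitles=[{'text': 'Hello', 'timestamp': [10.0, 12.5]},
#            {'text': 'World', 'timestamp': [12.5, 15.0]}]
print(update)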
acestep/gradio_ui/events/generation_handlers.py CHANGED
@@ -568,60 +568,3 @@ def update_audio_components_visibility(batch_size):
  return updates_row1 + updates_row2


- def update_codes_hints_visibility(src_audio, allow_lm_batch, batch_size):
- """Switch between single/batch codes input based on src_audio presence
-
- When src_audio is present:
- - Show single mode with transcribe button
- - Clear codes (will be filled by transcription)
-
- When src_audio is absent:
- - Hide transcribe button
- - Show batch mode if allow_lm_batch=True and batch_size>=2
- - Show single mode otherwise
-
- Row 1: Codes 1-4
- Row 2: Codes 5-8 (batch_size >= 5)
- """
- batch_size = min(max(int(batch_size), 1), 8)
- has_src_audio = src_audio is not None
-
- if has_src_audio:
- # Has src_audio: show single mode with transcribe button
- return (
- gr.update(visible=True), # codes_single_row
- gr.update(visible=False), # codes_batch_row
- gr.update(visible=False), # codes_batch_row_2
- *[gr.update(visible=False)] * 8, # Hide all batch columns
- gr.update(visible=True), # transcribe_btn: show when src_audio present
- )
- else:
- # No src_audio: decide between single/batch mode based on settings
- if allow_lm_batch and batch_size >= 2:
- # Batch mode: hide single, show batch codes with dynamic columns
- show_row_2 = batch_size >= 5
- return (
- gr.update(visible=False), # codes_single_row
- gr.update(visible=True), # codes_batch_row (row 1)
- gr.update(visible=show_row_2), # codes_batch_row_2 (row 2)
- # Row 1 columns (1-4)
- gr.update(visible=True), # codes_col_1: always visible in batch mode
- gr.update(visible=batch_size >= 2), # codes_col_2
- gr.update(visible=batch_size >= 3), # codes_col_3
- gr.update(visible=batch_size >= 4), # codes_col_4
- # Row 2 columns (5-8)
- gr.update(visible=batch_size >= 5), # codes_col_5
- gr.update(visible=batch_size >= 6), # codes_col_6
- gr.update(visible=batch_size >= 7), # codes_col_7
- gr.update(visible=batch_size >= 8), # codes_col_8
- gr.update(visible=False), # transcribe_btn: hide when no src_audio
- )
- else:
- # Single mode: show single, hide batch
- return (
- gr.update(visible=True), # codes_single_row
- gr.update(visible=False), # codes_batch_row
- gr.update(visible=False), # codes_batch_row_2
- *[gr.update(visible=False)] * 8, # Hide all batch columns
- gr.update(visible=False), # transcribe_btn: hide when no src_audio
- )
acestep/gradio_ui/events/results_handlers.py CHANGED
@@ -6,11 +6,12 @@ import os
6
  import json
7
  import datetime
8
  import math
 
9
  import tempfile
10
  import shutil
11
  import zipfile
12
  import time as time_module
13
- from typing import Dict, Any, Optional
14
  import gradio as gr
15
  from loguru import logger
16
  from acestep.gradio_ui.i18n import t
@@ -18,6 +19,88 @@ from acestep.inference import generate_music, GenerationParams, GenerationConfig
18
  from acestep.audio_utils import save_audio
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def _build_generation_info(
22
  lm_metadata: Optional[Dict[str, Any]],
23
  time_costs: Dict[str, float],
@@ -99,13 +182,16 @@ def _build_generation_info(
99
  # Post-processing time costs
100
  audio_conversion_time = time_costs.get('audio_conversion_time', 0.0)
101
  auto_score_time = time_costs.get('auto_score_time', 0.0)
 
102
 
103
- if audio_conversion_time > 0 or auto_score_time > 0:
104
  time_lines.append("\n**🔧 Post-processing Time:**")
105
  if audio_conversion_time > 0:
106
  time_lines.append(f" - Audio Conversion: {audio_conversion_time:.2f}s")
107
  if auto_score_time > 0:
108
  time_lines.append(f" - Auto Score: {auto_score_time:.2f}s")
 
 
109
 
110
  # Pipeline total
111
  pipeline_total = time_costs.get('pipeline_total_time', 0.0)
@@ -276,6 +362,7 @@ def generate_with_progress(
276
  constrained_decoding_debug,
277
  allow_lm_batch,
278
  auto_score,
 
279
  score_scale,
280
  lm_batch_chunk_size,
281
  progress=gr.Progress(track_tqdm=True),
@@ -357,6 +444,11 @@ def generate_with_progress(
357
  # Initialize post-processing timing
358
  audio_conversion_start_time = time_module.time()
359
  total_auto_score_time = 0.0
 
 
 
 
 
360
 
361
  updated_audio_codes = text2music_audio_code_string if not think_checkbox else ""
362
 
@@ -370,11 +462,52 @@ def generate_with_progress(
370
  )
371
 
372
  if not result.success:
373
- yield (None,) * 8 + (None, generation_info, result.status_message) + (gr.skip(),) * 20 + (None,) # +1 for extra_outputs
 
 
 
 
 
 
 
 
 
374
  return
375
 
376
  audios = result.audios
377
  progress(0.99, "Converting audio to mp3...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  for i in range(8):
379
  if i < len(audios):
380
  key = audios[i]["key"]
@@ -395,7 +528,7 @@ def generate_with_progress(
395
  code_str = audio_params.get("audio_codes", "")
396
  final_codes_list[i] = code_str
397
 
398
- scores_ui_updates = [gr.skip()] * 8
399
  score_str = "Done!"
400
  if auto_score:
401
  auto_score_start = time_module.time()
@@ -405,12 +538,82 @@ def generate_with_progress(
405
  scores_ui_updates[i] = score_str
406
  final_scores_list[i] = score_str
407
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408
  status_message = f"Encoding & Ready: {i+1}/{len(audios)}"
409
- current_audio_updates = [gr.skip()] * 8
 
410
  current_audio_updates[i] = audio_path
411
 
412
- audio_codes_ui_updates = [gr.skip()] * 8
413
- audio_codes_ui_updates[i] = code_str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  yield (
415
  current_audio_updates[0], current_audio_updates[1], current_audio_updates[2], current_audio_updates[3],
416
  current_audio_updates[4], current_audio_updates[5], current_audio_updates[6], current_audio_updates[7],
@@ -420,13 +623,19 @@ def generate_with_progress(
420
  seed_value_for_ui,
421
  # Scores
422
  scores_ui_updates[0], scores_ui_updates[1], scores_ui_updates[2], scores_ui_updates[3], scores_ui_updates[4], scores_ui_updates[5], scores_ui_updates[6], scores_ui_updates[7],
423
- updated_audio_codes,
424
- # Codes
425
- audio_codes_ui_updates[0], audio_codes_ui_updates[1], audio_codes_ui_updates[2], audio_codes_ui_updates[3],
426
- audio_codes_ui_updates[4], audio_codes_ui_updates[5], audio_codes_ui_updates[6], audio_codes_ui_updates[7],
 
 
 
 
 
427
  lm_generated_metadata,
428
  is_format_caption,
429
  None, # Placeholder for extra_outputs (only filled in final yield)
 
430
  )
431
  else:
432
  # If i exceeds the generated count (e.g., batch=2, i=2..7), do not yield
@@ -442,10 +651,12 @@ def generate_with_progress(
442
  time_costs['audio_conversion_time'] = audio_conversion_time
443
  if total_auto_score_time > 0:
444
  time_costs['auto_score_time'] = total_auto_score_time
 
 
445
 
446
  # Update pipeline total time to include post-processing
447
  if 'pipeline_total_time' in time_costs:
448
- time_costs['pipeline_total_time'] += audio_conversion_time + total_auto_score_time
449
 
450
  # Rebuild generation_info with complete timing information
451
  generation_info = _build_generation_info(
@@ -456,6 +667,23 @@ def generate_with_progress(
456
  num_audios=len(result.audios),
457
  )
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  yield (
460
  gr.skip(), gr.skip(), gr.skip(), gr.skip(), # Audio 1-4: SKIP
461
  gr.skip(), gr.skip(), gr.skip(), gr.skip(), # Audio 5-8: SKIP
@@ -465,12 +693,23 @@ def generate_with_progress(
465
  seed_value_for_ui,
466
  final_scores_list[0], final_scores_list[1], final_scores_list[2], final_scores_list[3],
467
  final_scores_list[4], final_scores_list[5], final_scores_list[6], final_scores_list[7],
468
- updated_audio_codes,
469
- final_codes_list[0], final_codes_list[1], final_codes_list[2], final_codes_list[3],
470
- final_codes_list[4], final_codes_list[5], final_codes_list[6], final_codes_list[7],
 
 
 
 
 
 
471
  lm_generated_metadata,
472
  is_format_caption,
473
- result.extra_outputs, # extra_outputs for LRC generation
 
 
 
 
 
474
  )
475
 
476
 
@@ -652,6 +891,7 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
652
 
653
  This function retrieves cached generation data from batch_queue and calls
654
  the handler's get_lyric_timestamp method to generate LRC format lyrics.
 
655
 
656
  Args:
657
  dit_handler: DiT handler instance with get_lyric_timestamp method
@@ -662,19 +902,19 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
662
  inference_steps: Number of inference steps used in generation
663
 
664
  Returns:
665
- LRC formatted string or error message
666
  """
667
  import torch
668
 
669
  if current_batch_index not in batch_queue:
670
- return gr.skip(), gr.skip()
671
 
672
  batch_data = batch_queue[current_batch_index]
673
  extra_outputs = batch_data.get("extra_outputs", {})
674
 
675
  # Check if required data is available
676
  if not extra_outputs:
677
- return gr.update(value=t("messages.lrc_no_extra_outputs"), visible=True), gr.update(visible=True)
678
 
679
  pred_latents = extra_outputs.get("pred_latents")
680
  encoder_hidden_states = extra_outputs.get("encoder_hidden_states")
@@ -683,7 +923,7 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
683
  lyric_token_idss = extra_outputs.get("lyric_token_idss")
684
 
685
  if any(x is None for x in [pred_latents, encoder_hidden_states, encoder_attention_mask, context_latents, lyric_token_idss]):
686
- return gr.update(value=t("messages.lrc_missing_tensors"), visible=True), gr.update(visible=True)
687
 
688
  # Adjust sample_idx to 0-based
689
  sample_idx_0based = sample_idx - 1
@@ -691,7 +931,7 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
691
  # Check if sample exists in batch
692
  batch_size = pred_latents.shape[0]
693
  if sample_idx_0based >= batch_size:
694
- return gr.update(value=t("messages.lrc_sample_not_exist"), visible=True), gr.update(visible=True)
695
 
696
  # Extract the specific sample's data
697
  try:
@@ -729,15 +969,72 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
729
  if result.get("success"):
730
  lrc_text = result.get("lrc_text", "")
731
  if not lrc_text:
732
- return gr.update(value=t("messages.lrc_empty_result"), visible=True), gr.update(visible=True)
733
- return gr.update(value=lrc_text, visible=True), gr.update(visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
  else:
735
  error_msg = result.get("error", "Unknown error")
736
- return gr.update(value=f"❌ {error_msg}", visible=True), gr.update(visible=True)
737
 
738
  except Exception as e:
739
  logger.exception("[generate_lrc_handler] Error generating LRC")
740
- return gr.update(value=f"❌ Error: {str(e)}", visible=True), gr.update(visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
741
 
742
 
743
  def capture_current_params(
@@ -749,7 +1046,7 @@ def capture_current_params(
749
  use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
750
  think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
751
  use_cot_metas, use_cot_caption, use_cot_language,
752
- constrained_decoding_debug, allow_lm_batch, auto_score, score_scale, lm_batch_chunk_size,
753
  track_name, complete_track_classes
754
  ):
755
  """Capture current UI parameters for next batch generation
@@ -796,6 +1093,7 @@ def capture_current_params(
796
  "constrained_decoding_debug": constrained_decoding_debug,
797
  "allow_lm_batch": allow_lm_batch,
798
  "auto_score": auto_score,
 
799
  "score_scale": score_scale,
800
  "lm_batch_chunk_size": lm_batch_chunk_size,
801
  "track_name": track_name,
@@ -816,6 +1114,7 @@ def generate_with_batch_management(
816
  constrained_decoding_debug,
817
  allow_lm_batch,
818
  auto_score,
 
819
  score_scale,
820
  lm_batch_chunk_size,
821
  track_name,
@@ -844,6 +1143,7 @@ def generate_with_batch_management(
844
  constrained_decoding_debug,
845
  allow_lm_batch,
846
  auto_score,
 
847
  score_scale,
848
  lm_batch_chunk_size,
849
  progress
@@ -853,8 +1153,8 @@ def generate_with_batch_management(
853
  final_result_from_inner = partial_result
854
  # current_batch_index, total_batches, batch_queue, next_params,
855
  # batch_indicator_text, prev_btn, next_btn, next_status, restore_btn
856
- # Slice off extra_outputs (last item) before re-yielding to UI
857
- ui_result = partial_result[:-1] if len(partial_result) > 31 else partial_result
858
  yield ui_result + (
859
  gr.skip(), gr.skip(), gr.skip(), gr.skip(),
860
  gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
@@ -863,8 +1163,8 @@ def generate_with_batch_management(
863
  all_audio_paths = result[8]
864
 
865
  if all_audio_paths is None:
866
- # Slice off extra_outputs before yielding to UI
867
- ui_result = result[:-1] if len(result) > 31 else result
868
  yield ui_result + (
869
  gr.skip(), gr.skip(), gr.skip(), gr.skip(),
870
  gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
@@ -872,14 +1172,18 @@ def generate_with_batch_management(
872
  return
873
 
874
  # Extract results from generation (使用 result 下标访问)
875
- # New indices after removing 6 align_* items (was 12-17, now shifted down by 6)
 
 
 
876
  generation_info = result[9]
877
  seed_value_for_ui = result[11]
878
- lm_generated_metadata = result[29] # was 35, now 29
879
 
880
- # Extract codes
881
- generated_codes_single = result[20] # was 26, now 20
882
- generated_codes_batch = [result[21], result[22], result[23], result[24], result[25], result[26], result[27], result[28]] # was 27-34, now 21-28
 
883
 
884
  # Determine which codes to store based on mode
885
  if allow_lm_batch and batch_size_input >= 2:
@@ -926,6 +1230,7 @@ def generate_with_batch_management(
926
  "constrained_decoding_debug": constrained_decoding_debug,
927
  "allow_lm_batch": allow_lm_batch,
928
  "auto_score": auto_score,
 
929
  "score_scale": score_scale,
930
  "lm_batch_chunk_size": lm_batch_chunk_size,
931
  "track_name": track_name,
@@ -938,8 +1243,9 @@ def generate_with_batch_management(
938
  next_params["text2music_audio_code_string"] = ""
939
  next_params["random_seed_checkbox"] = True
940
 
941
- # Extract extra_outputs from result tuple (index 31)
942
- extra_outputs_from_result = result[31] if len(result) > 31 else {}
 
943
 
944
  # Store current batch in queue
945
  batch_queue = store_batch_in_queue(
@@ -957,6 +1263,13 @@ def generate_with_batch_management(
957
  status="completed"
958
  )
959
 
 
 
 
 
 
 
 
960
  # Update batch counters
961
  total_batches = max(total_batches, current_batch_index + 1)
962
 
@@ -973,8 +1286,14 @@ def generate_with_batch_management(
973
 
974
  # 4. Yield final result (includes Batch UI updates)
975
  # The result here is already a tuple structure
976
- # Slice off extra_outputs (last item) before yielding to UI - it's already stored in batch_queue
977
- ui_result = result[:-1] if len(result) > 31 else result
 
 
 
 
 
 
978
  yield ui_result + (
979
  current_batch_index,
980
  total_batches,
@@ -1086,6 +1405,7 @@ def generate_next_batch_background(
1086
  params.setdefault("constrained_decoding_debug", False)
1087
  params.setdefault("allow_lm_batch", True)
1088
  params.setdefault("auto_score", False)
 
1089
  params.setdefault("score_scale", 0.5)
1090
  params.setdefault("lm_batch_chunk_size", 8)
1091
  params.setdefault("track_name", None)
@@ -1134,6 +1454,7 @@ def generate_next_batch_background(
1134
  constrained_decoding_debug=params.get("constrained_decoding_debug"),
1135
  allow_lm_batch=params.get("allow_lm_batch"),
1136
  auto_score=params.get("auto_score"),
 
1137
  score_scale=params.get("score_scale"),
1138
  lm_batch_chunk_size=params.get("lm_batch_chunk_size"),
1139
  progress=progress
@@ -1145,15 +1466,22 @@ def generate_next_batch_background(
1145
  final_result = partial_result
1146
 
1147
  # Extract results from final_result
1148
- # Indices shifted by -6 after removing align_* items
 
 
 
1149
  all_audio_paths = final_result[8] # generated_audio_batch
1150
  generation_info = final_result[9]
1151
  seed_value_for_ui = final_result[11]
1152
- lm_generated_metadata = final_result[29] # was 35, now 29
 
 
 
 
 
1153
 
1154
- # Extract codes
1155
- generated_codes_single = final_result[20] # was 26, now 20
1156
- generated_codes_batch = [final_result[21], final_result[22], final_result[23], final_result[24], final_result[25], final_result[26], final_result[27], final_result[28]] # was 27-34, now 21-28
1157
 
1158
  # Determine which codes to store
1159
  batch_size = params.get("batch_size_input", 2)
@@ -1168,6 +1496,7 @@ def generate_next_batch_background(
1168
  logger.info(f" - allow_lm_batch: {allow_lm_batch}")
1169
  logger.info(f" - batch_size: {batch_size}")
1170
  logger.info(f" - generated_codes_single exists: {bool(generated_codes_single)}")
 
1171
  if isinstance(codes_to_store, list):
1172
  logger.info(f" - codes_to_store: LIST with {len(codes_to_store)} items")
1173
  for idx, code in enumerate(codes_to_store):
@@ -1176,7 +1505,6 @@ def generate_next_batch_background(
1176
  logger.info(f" - codes_to_store: STRING with {len(codes_to_store) if codes_to_store else 0} chars")
1177
 
1178
  # Store next batch in queue with codes, batch settings, and ALL generation params
1179
- # Note: extra_outputs not available for background batches (LRC not supported for auto-gen batches)
1180
  batch_queue = store_batch_in_queue(
1181
  batch_queue,
1182
  next_batch_idx,
@@ -1188,7 +1516,7 @@ def generate_next_batch_background(
1188
  batch_size=int(batch_size),
1189
  generation_params=params,
1190
  lm_generated_metadata=lm_generated_metadata,
1191
- extra_outputs=None, # Not available for background batches
1192
  status="completed"
1193
  )
1194
 
@@ -1229,7 +1557,7 @@ def navigate_to_previous_batch(current_batch_index, batch_queue):
1229
  """Navigate to previous batch (Result View Only - Never touches Input UI)"""
1230
  if current_batch_index <= 0:
1231
  gr.Warning(t("messages.at_first_batch"))
1232
- return [gr.update()] * 24
1233
 
1234
  # Move to previous batch
1235
  new_batch_index = current_batch_index - 1
@@ -1237,17 +1565,25 @@ def navigate_to_previous_batch(current_batch_index, batch_queue):
1237
  # Load batch data from queue
1238
  if new_batch_index not in batch_queue:
1239
  gr.Warning(t("messages.batch_not_found", n=new_batch_index + 1))
1240
- return [gr.update()] * 24
1241
 
1242
  batch_data = batch_queue[new_batch_index]
1243
  audio_paths = batch_data.get("audio_paths", [])
1244
  generation_info_text = batch_data.get("generation_info", "")
1245
 
1246
- # Prepare audio outputs (up to 8)
1247
- audio_outputs = [None] * 8
1248
  real_audio_paths = [p for p in audio_paths if not p.lower().endswith('.json')]
1249
- for idx in range(min(len(real_audio_paths), 8)):
1250
- audio_outputs[idx] = real_audio_paths[idx]
 
 
 
 
 
 
 
 
 
1251
 
1252
  # Update batch indicator
1253
  total_batches = len(batch_queue)
@@ -1260,14 +1596,52 @@ def navigate_to_previous_batch(current_batch_index, batch_queue):
1260
  stored_scores = batch_data.get("scores", [""] * 8)
1261
  score_displays = stored_scores if stored_scores else [""] * 8
1262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1263
  return (
1264
- audio_outputs[0], audio_outputs[1], audio_outputs[2], audio_outputs[3],
1265
- audio_outputs[4], audio_outputs[5], audio_outputs[6], audio_outputs[7],
1266
  audio_paths, generation_info_text, new_batch_index, batch_indicator_text,
1267
  gr.update(interactive=can_go_previous), gr.update(interactive=can_go_next),
1268
  t("messages.viewing_batch", n=new_batch_index + 1),
1269
  score_displays[0], score_displays[1], score_displays[2], score_displays[3],
1270
  score_displays[4], score_displays[5], score_displays[6], score_displays[7],
 
 
 
 
 
 
1271
  gr.update(interactive=True),
1272
  )
1273
 
@@ -1276,7 +1650,7 @@ def navigate_to_next_batch(autogen_enabled, current_batch_index, total_batches,
1276
  """Navigate to next batch (Result View Only - Never touches Input UI)"""
1277
  if current_batch_index >= total_batches - 1:
1278
  gr.Warning(t("messages.at_last_batch"))
1279
- return [gr.update()] * 25
1280
 
1281
  # Move to next batch
1282
  new_batch_index = current_batch_index + 1
@@ -1284,17 +1658,25 @@ def navigate_to_next_batch(autogen_enabled, current_batch_index, total_batches,
1284
  # Load batch data from queue
1285
  if new_batch_index not in batch_queue:
1286
  gr.Warning(t("messages.batch_not_found", n=new_batch_index + 1))
1287
- return [gr.update()] * 25
1288
 
1289
  batch_data = batch_queue[new_batch_index]
1290
  audio_paths = batch_data.get("audio_paths", [])
1291
  generation_info_text = batch_data.get("generation_info", "")
1292
 
1293
- # Prepare audio outputs (up to 8)
1294
- audio_outputs = [None] * 8
1295
  real_audio_paths = [p for p in audio_paths if not p.lower().endswith('.json')]
1296
- for idx in range(min(len(real_audio_paths), 8)):
1297
- audio_outputs[idx] = real_audio_paths[idx]
 
 
 
 
 
 
 
 
 
1298
 
1299
  # Update batch indicator
1300
  batch_indicator_text = update_batch_indicator(new_batch_index, total_batches)
@@ -1312,14 +1694,52 @@ def navigate_to_next_batch(autogen_enabled, current_batch_index, total_batches,
1312
  stored_scores = batch_data.get("scores", [""] * 8)
1313
  score_displays = stored_scores if stored_scores else [""] * 8
1314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1315
  return (
1316
- audio_outputs[0], audio_outputs[1], audio_outputs[2], audio_outputs[3],
1317
- audio_outputs[4], audio_outputs[5], audio_outputs[6], audio_outputs[7],
1318
  audio_paths, generation_info_text, new_batch_index, batch_indicator_text,
1319
  gr.update(interactive=can_go_previous), gr.update(interactive=can_go_next),
1320
  t("messages.viewing_batch", n=new_batch_index + 1), next_batch_status_text,
1321
  score_displays[0], score_displays[1], score_displays[2], score_displays[3],
1322
  score_displays[4], score_displays[5], score_displays[6], score_displays[7],
 
 
 
 
 
 
1323
  gr.update(interactive=True),
1324
  )
1325
 
@@ -1331,7 +1751,7 @@ def restore_batch_parameters(current_batch_index, batch_queue):
1331
  """
1332
  if current_batch_index not in batch_queue:
1333
  gr.Warning(t("messages.no_batch_data"))
1334
- return [gr.update()] * 29
1335
 
1336
  batch_data = batch_queue[current_batch_index]
1337
  params = batch_data.get("generation_params", {})
@@ -1357,27 +1777,22 @@ def restore_batch_parameters(current_batch_index, batch_queue):
1357
  track_name = params.get("track_name", None)
1358
  complete_track_classes = params.get("complete_track_classes", [])
1359
 
1360
- # Extract and process codes
1361
  stored_codes = batch_data.get("codes", "")
1362
- stored_allow_lm_batch = params.get("allow_lm_batch", False)
1363
-
1364
- codes_outputs = [""] * 9 # [Main, 1-8]
1365
  if stored_codes:
1366
- if stored_allow_lm_batch and isinstance(stored_codes, list):
1367
- # Batch mode: populate codes 1-8, main shows first
1368
- codes_outputs[0] = stored_codes[0] if stored_codes else ""
1369
- for idx in range(min(len(stored_codes), 8)):
1370
- codes_outputs[idx + 1] = stored_codes[idx]
1371
  else:
1372
- # Single mode: populate main, clear 1-8
1373
- codes_outputs[0] = stored_codes if isinstance(stored_codes, str) else (stored_codes[0] if stored_codes else "")
 
 
1374
 
1375
  gr.Info(t("messages.params_restored", n=current_batch_index + 1))
1376
 
1377
  return (
1378
- codes_outputs[0], codes_outputs[1], codes_outputs[2], codes_outputs[3],
1379
- codes_outputs[4], codes_outputs[5], codes_outputs[6], codes_outputs[7],
1380
- codes_outputs[8], captions, lyrics, bpm, key_scale, time_signature,
1381
  vocal_language, audio_duration, batch_size_input, inference_steps,
1382
  lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, think_checkbox,
1383
  use_cot_caption, use_cot_language, allow_lm_batch,
 
6
  import json
7
  import datetime
8
  import math
9
+ import re
10
  import tempfile
11
  import shutil
12
  import zipfile
13
  import time as time_module
14
+ from typing import Dict, Any, Optional, List
15
  import gradio as gr
16
  from loguru import logger
17
  from acestep.gradio_ui.i18n import t
 
19
  from acestep.audio_utils import save_audio
20
 
21
 
22
+ def parse_lrc_to_subtitles(lrc_text: str, total_duration: Optional[float] = None) -> List[Dict[str, Any]]:
23
+ """
24
+ Parse LRC lyrics text to Gradio subtitles format.
25
+
26
+ LRC format: [MM:SS.ss]Lyric text or [MM:SS.ss][MM:SS.ss]Lyric text (with end time)
27
+ Gradio subtitles format: [{"text": str, "timestamp": [start, end]}]
28
+
29
+ Args:
30
+ lrc_text: LRC format lyrics string
31
+ total_duration: Total audio duration in seconds (used for last line's end time)
32
+
33
+ Returns:
34
+ List of subtitle dictionaries for Gradio Audio component
35
+ """
36
+ if not lrc_text or not lrc_text.strip():
37
+ return []
38
+
39
+ subtitles = []
40
+ lines = lrc_text.strip().split('\n')
41
+
42
+ # Regex patterns for LRC timestamps
43
+ # Pattern 1: [MM:SS.ss] - standard LRC with start time only
44
+ # Pattern 2: [MM:SS.ss][MM:SS.ss] - LRC with both start and end time
45
+ timestamp_pattern = r'\[(\d{2}):(\d{2})\.(\d{2})\]'
46
+
47
+ parsed_lines = []
48
+
49
+ for line in lines:
50
+ line = line.strip()
51
+ if not line:
52
+ continue
53
+
54
+ # Find all timestamps in the line
55
+ timestamps = re.findall(timestamp_pattern, line)
56
+ if not timestamps:
57
+ continue
58
+
59
+ # Remove timestamps from text to get the lyric content
60
+ text = re.sub(timestamp_pattern, '', line).strip()
61
+ if not text:
62
+ continue
63
+
64
+ # Parse first timestamp as start time
65
+ start_minutes, start_seconds, start_centiseconds = timestamps[0]
66
+ start_time = int(start_minutes) * 60 + int(start_seconds) + int(start_centiseconds) / 100.0
67
+
68
+ # If there's a second timestamp, use it as end time
69
+ end_time = None
70
+ if len(timestamps) >= 2:
71
+ end_minutes, end_seconds, end_centiseconds = timestamps[1]
72
+ end_time = int(end_minutes) * 60 + int(end_seconds) + int(end_centiseconds) / 100.0
73
+
74
+ parsed_lines.append({
75
+ 'start': start_time,
76
+ 'end': end_time,
77
+ 'text': text
78
+ })
79
+
80
+ # Sort by start time
81
+ parsed_lines.sort(key=lambda x: x['start'])
82
+
83
+ # Fill in missing end times using next line's start time
84
+ for i, line_data in enumerate(parsed_lines):
85
+ if line_data['end'] is None:
86
+ if i + 1 < len(parsed_lines):
87
+ # Use next line's start time as end time
88
+ line_data['end'] = parsed_lines[i + 1]['start']
89
+ elif total_duration is not None:
90
+ # Use total duration for last line
91
+ line_data['end'] = total_duration
92
+ else:
93
+ # Default: add 5 seconds if no duration info
94
+ line_data['end'] = line_data['start'] + 5.0
95
+
96
+ subtitles.append({
97
+ 'text': line_data['text'],
98
+ 'timestamp': [line_data['start'], line_data['end']]
99
+ })
100
+
101
+ return subtitles
102
+
103
+
104
  def _build_generation_info(
105
  lm_metadata: Optional[Dict[str, Any]],
106
  time_costs: Dict[str, float],
 
182
  # Post-processing time costs
183
  audio_conversion_time = time_costs.get('audio_conversion_time', 0.0)
184
  auto_score_time = time_costs.get('auto_score_time', 0.0)
185
+ auto_lrc_time = time_costs.get('auto_lrc_time', 0.0)
186
 
187
+ if audio_conversion_time > 0 or auto_score_time > 0 or auto_lrc_time > 0:
188
  time_lines.append("\n**🔧 Post-processing Time:**")
189
  if audio_conversion_time > 0:
190
  time_lines.append(f" - Audio Conversion: {audio_conversion_time:.2f}s")
191
  if auto_score_time > 0:
192
  time_lines.append(f" - Auto Score: {auto_score_time:.2f}s")
193
+ if auto_lrc_time > 0:
194
+ time_lines.append(f" - Auto LRC: {auto_lrc_time:.2f}s")
195
 
196
  # Pipeline total
197
  pipeline_total = time_costs.get('pipeline_total_time', 0.0)
 
362
  constrained_decoding_debug,
363
  allow_lm_batch,
364
  auto_score,
365
+ auto_lrc,
366
  score_scale,
367
  lm_batch_chunk_size,
368
  progress=gr.Progress(track_tqdm=True),
 
444
  # Initialize post-processing timing
445
  audio_conversion_start_time = time_module.time()
446
  total_auto_score_time = 0.0
447
+ total_auto_lrc_time = 0.0
448
+
449
+ # Initialize LRC storage for auto_lrc
450
+ final_lrcs_list = [""] * 8
451
+ final_subtitles_list = [None] * 8
452
 
453
  updated_audio_codes = text2music_audio_code_string if not think_checkbox else ""
454
 
 
462
  )
463
 
464
  if not result.success:
465
+ # Structure: 8 audio + batch_files + gen_info + status + seed + 8 scores + 8 codes_display + 8 accordions + 8 lrc_display + lm_meta + is_format + extra_outputs + raw_codes
466
+ yield (
467
+ (None,) * 8 + # audio outputs
468
+ (None, generation_info, result.status_message, gr.skip()) + # batch_files, gen_info, status, seed
469
+ (gr.skip(),) * 8 + # scores
470
+ (gr.skip(),) * 8 + # codes_display
471
+ (gr.skip(),) * 8 + # details_accordion
472
+ (gr.skip(),) * 8 + # lrc_display
473
+ (None, is_format_caption, None, None) # lm_meta, is_format, extra_outputs, raw_codes
474
+ )
475
  return
476
 
477
  audios = result.audios
478
  progress(0.99, "Converting audio to mp3...")
479
+
480
+ # Clear all scores, codes, and lrc displays at the start of generation
481
+ # Note: Create independent gr.update objects (not references to the same object)
482
+ clear_scores = [gr.update(value="", visible=False) for _ in range(8)]
483
+ clear_codes = [gr.update(value="", visible=False) for _ in range(8)]
484
+ clear_lrcs = [gr.update(value="", visible=False) for _ in range(8)]
485
+ clear_accordions = [gr.update(visible=False) for _ in range(8)]
486
+ yield (
487
+ # Audio outputs (keep as skip, will be updated in loop)
488
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
489
+ None, # all_audio_paths (clear batch files)
490
+ generation_info,
491
+ "Clearing previous results...",
492
+ gr.skip(), # seed
493
+ # Clear scores
494
+ clear_scores[0], clear_scores[1], clear_scores[2], clear_scores[3],
495
+ clear_scores[4], clear_scores[5], clear_scores[6], clear_scores[7],
496
+ # Clear codes display
497
+ clear_codes[0], clear_codes[1], clear_codes[2], clear_codes[3],
498
+ clear_codes[4], clear_codes[5], clear_codes[6], clear_codes[7],
499
+ # Clear accordions
500
+ clear_accordions[0], clear_accordions[1], clear_accordions[2], clear_accordions[3],
501
+ clear_accordions[4], clear_accordions[5], clear_accordions[6], clear_accordions[7],
502
+ # Clear lrc displays
503
+ clear_lrcs[0], clear_lrcs[1], clear_lrcs[2], clear_lrcs[3],
504
+ clear_lrcs[4], clear_lrcs[5], clear_lrcs[6], clear_lrcs[7],
505
+ lm_generated_metadata,
506
+ is_format_caption,
507
+ None, # extra_outputs placeholder
508
+ None, # raw_codes placeholder
509
+ )
510
+
511
  for i in range(8):
512
  if i < len(audios):
513
  key = audios[i]["key"]
 
528
  code_str = audio_params.get("audio_codes", "")
529
  final_codes_list[i] = code_str
530
 
531
+ scores_ui_updates = [gr.skip() for _ in range(8)]
532
  score_str = "Done!"
533
  if auto_score:
534
  auto_score_start = time_module.time()
 
538
  scores_ui_updates[i] = score_str
539
  final_scores_list[i] = score_str
540
 
541
+ # Auto LRC generation
542
+ if auto_lrc:
543
+ auto_lrc_start = time_module.time()
544
+ logger.info(f"[auto_lrc] Starting LRC generation for sample {i + 1}")
545
+ try:
546
+ # Get extra_outputs for this sample
547
+ pred_latents = result.extra_outputs.get("pred_latents")
548
+ encoder_hidden_states = result.extra_outputs.get("encoder_hidden_states")
549
+ encoder_attention_mask = result.extra_outputs.get("encoder_attention_mask")
550
+ context_latents = result.extra_outputs.get("context_latents")
551
+ lyric_token_idss = result.extra_outputs.get("lyric_token_idss")
552
+
553
+ logger.info(f"[auto_lrc] pred_latents: {pred_latents is not None}, encoder_hidden_states: {encoder_hidden_states is not None}, encoder_attention_mask: {encoder_attention_mask is not None}, context_latents: {context_latents is not None}, lyric_token_idss: {lyric_token_idss is not None}")
554
+
555
+ if all(x is not None for x in [pred_latents, encoder_hidden_states, encoder_attention_mask, context_latents, lyric_token_idss]):
556
+ # Extract single sample tensors
557
+ sample_pred_latent = pred_latents[i:i+1]
558
+ sample_encoder_hidden_states = encoder_hidden_states[i:i+1]
559
+ sample_encoder_attention_mask = encoder_attention_mask[i:i+1]
560
+ sample_context_latents = context_latents[i:i+1]
561
+ sample_lyric_token_ids = lyric_token_idss[i:i+1]
562
+
563
+ # Calculate actual duration
564
+ actual_duration = audio_duration
565
+ if actual_duration is None or actual_duration <= 0:
566
+ latent_length = pred_latents.shape[1]
567
+ actual_duration = latent_length / 25.0 # 25 Hz latent rate
568
+
569
+ lrc_result = dit_handler.get_lyric_timestamp(
570
+ pred_latent=sample_pred_latent,
571
+ encoder_hidden_states=sample_encoder_hidden_states,
572
+ encoder_attention_mask=sample_encoder_attention_mask,
573
+ context_latents=sample_context_latents,
574
+ lyric_token_ids=sample_lyric_token_ids,
575
+ total_duration_seconds=float(actual_duration),
576
+ vocal_language=vocal_language or "en",
577
+ inference_steps=int(inference_steps),
578
+ seed=42,
579
+ )
580
+
581
+ logger.info(f"[auto_lrc] LRC result for sample {i + 1}: success={lrc_result.get('success')}")
582
+ if lrc_result.get("success"):
583
+ lrc_text = lrc_result.get("lrc_text", "")
584
+ final_lrcs_list[i] = lrc_text
585
+ logger.info(f"[auto_lrc] LRC text length for sample {i + 1}: {len(lrc_text)}")
586
+ # Parse LRC to subtitles format
587
+ subtitles_data = parse_lrc_to_subtitles(lrc_text, total_duration=float(actual_duration))
588
+ final_subtitles_list[i] = subtitles_data
589
+ else:
590
+ logger.warning(f"[auto_lrc] Missing required extra_outputs for sample {i + 1}")
591
+ except Exception as e:
592
+ logger.warning(f"[auto_lrc] Failed to generate LRC for sample {i + 1}: {e}")
593
+ auto_lrc_end = time_module.time()
594
+ total_auto_lrc_time += (auto_lrc_end - auto_lrc_start)
595
+
596
  status_message = f"Encoding & Ready: {i+1}/{len(audios)}"
597
+ current_audio_updates = [gr.skip() for _ in range(8)]
598
+ # Always set audio path first, subtitles will be applied via Audio component's subtitles parameter
599
  current_audio_updates[i] = audio_path
600
 
601
+ # Codes display updates (for results section)
602
+ codes_display_updates = [gr.skip() for _ in range(8)]
603
+ codes_display_updates[i] = gr.update(value=code_str, visible=bool(code_str))
604
+
605
+ # LRC display updates
606
+ lrc_display_updates = [gr.skip() for _ in range(8)]
607
+ has_lrc = bool(final_lrcs_list[i])
608
+ if auto_lrc and has_lrc:
609
+ lrc_display_updates[i] = gr.update(value=final_lrcs_list[i], visible=True)
610
+
611
+ # Details accordion updates (show if code OR lrc OR score exists)
612
+ details_accordion_updates = [gr.skip() for _ in range(8)]
613
+ has_score = bool(score_str) and score_str != "Done!"
614
+ has_content = bool(code_str) or has_lrc or has_score
615
+ details_accordion_updates[i] = gr.update(visible=has_content)
616
+
617
  yield (
618
  current_audio_updates[0], current_audio_updates[1], current_audio_updates[2], current_audio_updates[3],
619
  current_audio_updates[4], current_audio_updates[5], current_audio_updates[6], current_audio_updates[7],
 
623
  seed_value_for_ui,
624
  # Scores
625
  scores_ui_updates[0], scores_ui_updates[1], scores_ui_updates[2], scores_ui_updates[3], scores_ui_updates[4], scores_ui_updates[5], scores_ui_updates[6], scores_ui_updates[7],
626
+ # Codes display in results section
627
+ codes_display_updates[0], codes_display_updates[1], codes_display_updates[2], codes_display_updates[3],
628
+ codes_display_updates[4], codes_display_updates[5], codes_display_updates[6], codes_display_updates[7],
629
+ # Details accordion visibility
630
+ details_accordion_updates[0], details_accordion_updates[1], details_accordion_updates[2], details_accordion_updates[3],
631
+ details_accordion_updates[4], details_accordion_updates[5], details_accordion_updates[6], details_accordion_updates[7],
632
+ # LRC display
633
+ lrc_display_updates[0], lrc_display_updates[1], lrc_display_updates[2], lrc_display_updates[3],
634
+ lrc_display_updates[4], lrc_display_updates[5], lrc_display_updates[6], lrc_display_updates[7],
635
  lm_generated_metadata,
636
  is_format_caption,
637
  None, # Placeholder for extra_outputs (only filled in final yield)
638
+ None, # Placeholder for raw_codes_list (only filled in final yield)
639
  )
640
  else:
641
  # If i exceeds the generated count (e.g., batch=2, i=2..7), do not yield
 
651
  time_costs['audio_conversion_time'] = audio_conversion_time
652
  if total_auto_score_time > 0:
653
  time_costs['auto_score_time'] = total_auto_score_time
654
+ if total_auto_lrc_time > 0:
655
+ time_costs['auto_lrc_time'] = total_auto_lrc_time
656
 
657
  # Update pipeline total time to include post-processing
658
  if 'pipeline_total_time' in time_costs:
659
+ time_costs['pipeline_total_time'] += audio_conversion_time + total_auto_score_time + total_auto_lrc_time
660
 
661
  # Rebuild generation_info with complete timing information
662
  generation_info = _build_generation_info(
 
667
  num_audios=len(result.audios),
668
  )
669
 
670
+ # Build final codes display, LRC display, and accordion visibility updates
671
+ final_codes_display_updates = []
672
+ final_lrc_display_updates = []
673
+ final_accordion_updates = []
674
+ for i in range(8):
675
+ code_str = final_codes_list[i]
676
+ lrc_text = final_lrcs_list[i]
677
+ score_str = final_scores_list[i]
678
+ has_code = bool(code_str)
679
+ has_lrc = bool(lrc_text)
680
+ has_score = bool(score_str) and score_str != "Done!"
681
+ # Show accordion if code OR LRC OR score exists
682
+ has_content = has_code or has_lrc or has_score
683
+ final_codes_display_updates.append(gr.update(value=code_str, visible=has_code))
684
+ final_lrc_display_updates.append(gr.update(value=lrc_text, visible=has_lrc))
685
+ final_accordion_updates.append(gr.update(visible=has_content))
686
+
687
  yield (
688
  gr.skip(), gr.skip(), gr.skip(), gr.skip(), # Audio 1-4: SKIP
689
  gr.skip(), gr.skip(), gr.skip(), gr.skip(), # Audio 5-8: SKIP
 
693
  seed_value_for_ui,
694
  final_scores_list[0], final_scores_list[1], final_scores_list[2], final_scores_list[3],
695
  final_scores_list[4], final_scores_list[5], final_scores_list[6], final_scores_list[7],
696
+ # Codes display in results section
697
+ final_codes_display_updates[0], final_codes_display_updates[1], final_codes_display_updates[2], final_codes_display_updates[3],
698
+ final_codes_display_updates[4], final_codes_display_updates[5], final_codes_display_updates[6], final_codes_display_updates[7],
699
+ # Details accordion visibility
700
+ final_accordion_updates[0], final_accordion_updates[1], final_accordion_updates[2], final_accordion_updates[3],
701
+ final_accordion_updates[4], final_accordion_updates[5], final_accordion_updates[6], final_accordion_updates[7],
702
+ # LRC display
703
+ final_lrc_display_updates[0], final_lrc_display_updates[1], final_lrc_display_updates[2], final_lrc_display_updates[3],
704
+ final_lrc_display_updates[4], final_lrc_display_updates[5], final_lrc_display_updates[6], final_lrc_display_updates[7],
705
  lm_generated_metadata,
706
  is_format_caption,
707
+ {
708
+ **result.extra_outputs,
709
+ "lrcs": final_lrcs_list,
710
+ "subtitles": final_subtitles_list,
711
+ }, # extra_outputs for LRC generation (with auto_lrc results)
712
+ final_codes_list, # Raw codes list for batch storage (index 47)
713
  )
714
 
715
 
 
891
 
892
  This function retrieves cached generation data from batch_queue and calls
893
  the handler's get_lyric_timestamp method to generate LRC format lyrics.
894
+ Audio subtitles are automatically updated via lrc_display.change() event.
895
 
896
  Args:
897
  dit_handler: DiT handler instance with get_lyric_timestamp method
 
902
  inference_steps: Number of inference steps used in generation
903
 
904
  Returns:
905
+ Tuple of (lrc_display_update, details_accordion_update, batch_queue)
906
  """
907
  import torch
908
 
909
  if current_batch_index not in batch_queue:
910
+ return gr.skip(), gr.skip(), batch_queue
911
 
912
  batch_data = batch_queue[current_batch_index]
913
  extra_outputs = batch_data.get("extra_outputs", {})
914
 
915
  # Check if required data is available
916
  if not extra_outputs:
917
+ return gr.update(value=t("messages.lrc_no_extra_outputs"), visible=True), gr.update(visible=True), batch_queue
918
 
919
  pred_latents = extra_outputs.get("pred_latents")
920
  encoder_hidden_states = extra_outputs.get("encoder_hidden_states")
 
923
  lyric_token_idss = extra_outputs.get("lyric_token_idss")
924
 
925
  if any(x is None for x in [pred_latents, encoder_hidden_states, encoder_attention_mask, context_latents, lyric_token_idss]):
926
+ return gr.update(value=t("messages.lrc_missing_tensors"), visible=True), gr.update(visible=True), batch_queue
927
 
928
  # Adjust sample_idx to 0-based
929
  sample_idx_0based = sample_idx - 1
 
931
  # Check if sample exists in batch
932
  batch_size = pred_latents.shape[0]
933
  if sample_idx_0based >= batch_size:
934
+ return gr.update(value=t("messages.lrc_sample_not_exist"), visible=True), gr.update(visible=True), batch_queue
935
 
936
  # Extract the specific sample's data
937
  try:
 
969
  if result.get("success"):
970
  lrc_text = result.get("lrc_text", "")
971
  if not lrc_text:
972
+ return gr.update(value=t("messages.lrc_empty_result"), visible=True), gr.update(visible=True), batch_queue
973
+
974
+ # Store LRC in batch_queue for later retrieval when switching batches
975
+ if "lrcs" not in batch_queue[current_batch_index]:
976
+ batch_queue[current_batch_index]["lrcs"] = [""] * 8
977
+ batch_queue[current_batch_index]["lrcs"][sample_idx_0based] = lrc_text
978
+
979
+ # Parse LRC to subtitles format for storage (audio subtitles will be updated via lrc_display.change())
980
+ subtitles_data = parse_lrc_to_subtitles(lrc_text, total_duration=float(audio_duration))
981
+
982
+ # Store subtitles in batch_queue for batch navigation
983
+ if "subtitles" not in batch_queue[current_batch_index]:
984
+ batch_queue[current_batch_index]["subtitles"] = [None] * 8
985
+ batch_queue[current_batch_index]["subtitles"][sample_idx_0based] = subtitles_data
986
+
987
+ # Return: lrc_display, details_accordion, batch_queue
988
+ # Audio subtitles are automatically updated via lrc_display.change() event
989
+ return (
990
+ gr.update(value=lrc_text, visible=True),
991
+ gr.update(visible=True),
992
+ batch_queue
993
+ )
994
  else:
995
  error_msg = result.get("error", "Unknown error")
996
+ return gr.update(value=f"❌ {error_msg}", visible=True), gr.update(visible=True), batch_queue
997
 
998
  except Exception as e:
999
  logger.exception("[generate_lrc_handler] Error generating LRC")
1000
+ return gr.update(value=f"❌ Error: {str(e)}", visible=True), gr.update(visible=True), batch_queue
1001
+
1002
+
1003
+ def update_audio_subtitles_from_lrc(lrc_text: str, audio_component_value, audio_duration: float = None):
1004
+ """
1005
+ Update Audio component's subtitles based on LRC text content.
1006
+
1007
+ This function is triggered when lrc_display textbox changes.
1008
+ It parses the LRC text and updates the corresponding Audio component's subtitles.
1009
+
1010
+ Args:
1011
+ lrc_text: LRC format lyrics string from lrc_display textbox
1012
+ audio_component_value: Current value of the audio component (path or dict)
1013
+ audio_duration: Optional audio duration for calculating last line's end time
1014
+
1015
+ Returns:
1016
+ gr.update for the Audio component with subtitles
1017
+ """
1018
+ # If no LRC text, skip update (don't clear subtitles to avoid flickering)
1019
+ if not lrc_text or not lrc_text.strip():
1020
+ return gr.skip()
1021
+
1022
+ # Get audio path from component value
1023
+ audio_path = None
1024
+ if audio_component_value:
1025
+ if isinstance(audio_component_value, dict):
1026
+ audio_path = audio_component_value.get("path") or audio_component_value.get("value")
1027
+ else:
1028
+ audio_path = audio_component_value
1029
+
1030
+ if not audio_path:
1031
+ return gr.skip()
1032
+
1033
+ # Parse LRC to subtitles format
1034
+ subtitles_data = parse_lrc_to_subtitles(lrc_text, total_duration=audio_duration)
1035
+
1036
+ # Return updated audio with subtitles
1037
+ return gr.update(value=audio_path, subtitles=subtitles_data if subtitles_data else None)
1038
 
1039
 
1040
  def capture_current_params(
 
1046
  use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
1047
  think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
1048
  use_cot_metas, use_cot_caption, use_cot_language,
1049
+ constrained_decoding_debug, allow_lm_batch, auto_score, auto_lrc, score_scale, lm_batch_chunk_size,
1050
  track_name, complete_track_classes
1051
  ):
1052
  """Capture current UI parameters for next batch generation
 
1093
  "constrained_decoding_debug": constrained_decoding_debug,
1094
  "allow_lm_batch": allow_lm_batch,
1095
  "auto_score": auto_score,
1096
+ "auto_lrc": auto_lrc,
1097
  "score_scale": score_scale,
1098
  "lm_batch_chunk_size": lm_batch_chunk_size,
1099
  "track_name": track_name,
 
1114
  constrained_decoding_debug,
1115
  allow_lm_batch,
1116
  auto_score,
1117
+ auto_lrc,
1118
  score_scale,
1119
  lm_batch_chunk_size,
1120
  track_name,
 
1143
  constrained_decoding_debug,
1144
  allow_lm_batch,
1145
  auto_score,
1146
+ auto_lrc,
1147
  score_scale,
1148
  lm_batch_chunk_size,
1149
  progress
 
1153
  final_result_from_inner = partial_result
1154
  # current_batch_index, total_batches, batch_queue, next_params,
1155
  # batch_indicator_text, prev_btn, next_btn, next_status, restore_btn
1156
+ # Slice off extra_outputs and raw_codes_list (last 2 items) before re-yielding to UI
1157
+ ui_result = partial_result[:-2] if len(partial_result) > 47 else (partial_result[:-1] if len(partial_result) > 46 else partial_result)
1158
  yield ui_result + (
1159
  gr.skip(), gr.skip(), gr.skip(), gr.skip(),
1160
  gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
 
1163
  all_audio_paths = result[8]
1164
 
1165
  if all_audio_paths is None:
1166
+ # Slice off extra_outputs and raw_codes_list before yielding to UI
1167
+ ui_result = result[:-2] if len(result) > 47 else (result[:-1] if len(result) > 46 else result)
1168
  yield ui_result + (
1169
  gr.skip(), gr.skip(), gr.skip(), gr.skip(),
1170
  gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
 
1172
  return
1173
 
1174
  # Extract results from generation (fields are accessed by index into result)
1175
+ # New structure after UI refactor (with lrc_display added):
1176
+ # 0-7: audio_outputs, 8: all_audio_paths, 9: generation_info, 10: status, 11: seed
1177
+ # 12-19: scores, 20-27: codes_display, 28-35: details_accordion, 36-43: lrc_display
1178
+ # 44: lm_metadata, 45: is_format_caption, 46: extra_outputs, 47: raw_codes_list
1179
  generation_info = result[9]
1180
  seed_value_for_ui = result[11]
1181
+ lm_generated_metadata = result[44]
1182
 
1183
+ # Extract raw codes list directly (index 47)
1184
+ raw_codes_list = result[47] if len(result) > 47 else [""] * 8
1185
+ generated_codes_batch = raw_codes_list if isinstance(raw_codes_list, list) else [""] * 8
1186
+ generated_codes_single = generated_codes_batch[0] if generated_codes_batch else ""
1187
 
1188
  # Determine which codes to store based on mode
1189
  if allow_lm_batch and batch_size_input >= 2:
 
1230
  "constrained_decoding_debug": constrained_decoding_debug,
1231
  "allow_lm_batch": allow_lm_batch,
1232
  "auto_score": auto_score,
1233
+ "auto_lrc": auto_lrc,
1234
  "score_scale": score_scale,
1235
  "lm_batch_chunk_size": lm_batch_chunk_size,
1236
  "track_name": track_name,
 
1243
  next_params["text2music_audio_code_string"] = ""
1244
  next_params["random_seed_checkbox"] = True
1245
 
1246
+ # Extract extra_outputs from result tuple (index 46 after adding lrc_display)
1247
+ # Note: index 47 is raw_codes_list which we already extracted above
1248
+ extra_outputs_from_result = result[46] if len(result) > 46 else {}
1249
 
1250
  # Store current batch in queue
1251
  batch_queue = store_batch_in_queue(
 
1263
  status="completed"
1264
  )
1265
 
1266
+ # Extract auto_lrc results from extra_outputs (generated in generate_with_progress)
1267
+ if auto_lrc and extra_outputs_from_result:
1268
+ lrcs_from_extra = extra_outputs_from_result.get("lrcs", [""] * 8)
1269
+ subtitles_from_extra = extra_outputs_from_result.get("subtitles", [None] * 8)
1270
+ batch_queue[current_batch_index]["lrcs"] = lrcs_from_extra
1271
+ batch_queue[current_batch_index]["subtitles"] = subtitles_from_extra
1272
+
1273
  # Update batch counters
1274
  total_batches = max(total_batches, current_batch_index + 1)
1275
 
 
1286
 
1287
  # 4. Yield final result (includes Batch UI updates)
1288
  # The result here is already a tuple structure
1289
+ # Slice off extra_outputs and raw_codes_list (last 2 items) before yielding to UI - they're already stored in batch_queue
1290
+ # New structure (with lrc_display):
1291
+ # 0-7: audio_outputs, 8: all_audio_paths, 9: generation_info, 10: status, 11: seed
1292
+ # 12-19: scores, 20-27: codes_display, 28-35: details_accordion, 36-43: lrc_display
1293
+ # 44: lm_metadata, 45: is_format_caption, 46: extra_outputs, 47: raw_codes_list
1294
+ # Note: Audio subtitles are already included in the intermediate yields from generate_with_progress
1295
+ ui_result = result[:-2] if len(result) > 47 else (result[:-1] if len(result) > 46 else result)
1296
+
1297
  yield ui_result + (
1298
  current_batch_index,
1299
  total_batches,
 
1405
  params.setdefault("constrained_decoding_debug", False)
1406
  params.setdefault("allow_lm_batch", True)
1407
  params.setdefault("auto_score", False)
1408
+ params.setdefault("auto_lrc", False)
1409
  params.setdefault("score_scale", 0.5)
1410
  params.setdefault("lm_batch_chunk_size", 8)
1411
  params.setdefault("track_name", None)
 
1454
  constrained_decoding_debug=params.get("constrained_decoding_debug"),
1455
  allow_lm_batch=params.get("allow_lm_batch"),
1456
  auto_score=params.get("auto_score"),
1457
+ auto_lrc=params.get("auto_lrc"),
1458
  score_scale=params.get("score_scale"),
1459
  lm_batch_chunk_size=params.get("lm_batch_chunk_size"),
1460
  progress=progress
 
1466
  final_result = partial_result
1467
 
1468
  # Extract results from final_result
1469
+ # New structure after UI refactor (with lrc_display added):
1470
+ # 0-7: audio_outputs, 8: all_audio_paths, 9: generation_info, 10: status, 11: seed
1471
+ # 12-19: scores, 20-27: codes_display, 28-35: details_accordion, 36-43: lrc_display
1472
+ # 44: lm_metadata, 45: is_format_caption, 46: extra_outputs, 47: raw_codes_list
1473
  all_audio_paths = final_result[8] # generated_audio_batch
1474
  generation_info = final_result[9]
1475
  seed_value_for_ui = final_result[11]
1476
+ lm_generated_metadata = final_result[44]
1477
+
1478
+ # Extract raw codes list directly (index 47)
1479
+ raw_codes_list = final_result[47] if len(final_result) > 47 else [""] * 8
1480
+ generated_codes_batch = raw_codes_list if isinstance(raw_codes_list, list) else [""] * 8
1481
+ generated_codes_single = generated_codes_batch[0] if generated_codes_batch else ""
1482
 
1483
+ # Extract extra_outputs for LRC generation (index 46)
1484
+ extra_outputs_from_bg = final_result[46] if len(final_result) > 46 else None
 
1485
 
1486
  # Determine which codes to store
1487
  batch_size = params.get("batch_size_input", 2)
 
1496
  logger.info(f" - allow_lm_batch: {allow_lm_batch}")
1497
  logger.info(f" - batch_size: {batch_size}")
1498
  logger.info(f" - generated_codes_single exists: {bool(generated_codes_single)}")
1499
+ logger.info(f" - extra_outputs_from_bg exists: {extra_outputs_from_bg is not None}")
1500
  if isinstance(codes_to_store, list):
1501
  logger.info(f" - codes_to_store: LIST with {len(codes_to_store)} items")
1502
  for idx, code in enumerate(codes_to_store):
 
1505
  logger.info(f" - codes_to_store: STRING with {len(codes_to_store) if codes_to_store else 0} chars")
1506
 
1507
  # Store next batch in queue with codes, batch settings, and ALL generation params
 
1508
  batch_queue = store_batch_in_queue(
1509
  batch_queue,
1510
  next_batch_idx,
 
1516
  batch_size=int(batch_size),
1517
  generation_params=params,
1518
  lm_generated_metadata=lm_generated_metadata,
1519
+ extra_outputs=extra_outputs_from_bg, # Now properly extracted from generation result
1520
  status="completed"
1521
  )
1522
 
 
1557
  """Navigate to previous batch (Result View Only - Never touches Input UI)"""
1558
  if current_batch_index <= 0:
1559
  gr.Warning(t("messages.at_first_batch"))
1560
+ return [gr.update()] * 48 # 8 audio + 2 batch files/info + 1 index + 1 indicator + 2 btns + 1 status + 8 scores + 8 codes + 8 lrc + 8 accordions + 1 restore
1561
 
1562
  # Move to previous batch
1563
  new_batch_index = current_batch_index - 1
 
1565
  # Load batch data from queue
1566
  if new_batch_index not in batch_queue:
1567
  gr.Warning(t("messages.batch_not_found", n=new_batch_index + 1))
1568
+ return [gr.update()] * 48
1569
 
1570
  batch_data = batch_queue[new_batch_index]
1571
  audio_paths = batch_data.get("audio_paths", [])
1572
  generation_info_text = batch_data.get("generation_info", "")
1573
 
1574
+ # Prepare audio outputs (up to 8) with subtitles
 
1575
  real_audio_paths = [p for p in audio_paths if not p.lower().endswith('.json')]
1576
+ stored_subtitles = batch_data.get("subtitles", [None] * 8)
1577
+
1578
+ audio_updates = []
1579
+ for idx in range(8):
1580
+ if idx < len(real_audio_paths):
1581
+ audio_path = real_audio_paths[idx]
1582
+ subtitles_data = stored_subtitles[idx] if idx < len(stored_subtitles) else None
1583
+ # Use gr.update to set both value and subtitles
1584
+ audio_updates.append(gr.update(value=audio_path, subtitles=subtitles_data))
1585
+ else:
1586
+ audio_updates.append(gr.update(value=None, subtitles=None))
1587
 
1588
  # Update batch indicator
1589
  total_batches = len(batch_queue)
 
1596
  stored_scores = batch_data.get("scores", [""] * 8)
1597
  score_displays = stored_scores if stored_scores else [""] * 8
1598
 
1599
+ # Restore LRC displays from batch queue (clear if not stored)
1600
+ stored_lrcs = batch_data.get("lrcs", [""] * 8)
1601
+ lrc_displays = stored_lrcs if stored_lrcs else [""] * 8
1602
+
1603
+ # Restore codes display from batch queue
1604
+ stored_codes = batch_data.get("codes", "")
1605
+ stored_allow_lm_batch = batch_data.get("allow_lm_batch", False)
1606
+ batch_size = batch_data.get("batch_size", 2)
1607
+
1608
+ codes_display_updates = []
1609
+ lrc_display_updates = []
1610
+ details_accordion_updates = []
1611
+ for i in range(8):
1612
+ if stored_allow_lm_batch and isinstance(stored_codes, list):
1613
+ code_str = stored_codes[i] if i < len(stored_codes) else ""
1614
+ else:
1615
+ code_str = stored_codes if isinstance(stored_codes, str) and i == 0 else ""
1616
+
1617
+ lrc_str = lrc_displays[i] if i < len(lrc_displays) else ""
1618
+ score_str = score_displays[i] if i < len(score_displays) else ""
1619
+
1620
+ has_code = bool(code_str) and i < batch_size
1621
+ has_lrc = bool(lrc_str)
1622
+ has_score = bool(score_str)
1623
+
1624
+ # Show accordion if any content exists
1625
+ has_content = has_code or has_lrc or has_score
1626
+
1627
+ codes_display_updates.append(gr.update(value=code_str, visible=has_code))
1628
+ lrc_display_updates.append(gr.update(value=lrc_str, visible=has_lrc))
1629
+ details_accordion_updates.append(gr.update(visible=has_content))
1630
+
1631
  return (
1632
+ audio_updates[0], audio_updates[1], audio_updates[2], audio_updates[3],
1633
+ audio_updates[4], audio_updates[5], audio_updates[6], audio_updates[7],
1634
  audio_paths, generation_info_text, new_batch_index, batch_indicator_text,
1635
  gr.update(interactive=can_go_previous), gr.update(interactive=can_go_next),
1636
  t("messages.viewing_batch", n=new_batch_index + 1),
1637
  score_displays[0], score_displays[1], score_displays[2], score_displays[3],
1638
  score_displays[4], score_displays[5], score_displays[6], score_displays[7],
1639
+ codes_display_updates[0], codes_display_updates[1], codes_display_updates[2], codes_display_updates[3],
1640
+ codes_display_updates[4], codes_display_updates[5], codes_display_updates[6], codes_display_updates[7],
1641
+ lrc_display_updates[0], lrc_display_updates[1], lrc_display_updates[2], lrc_display_updates[3],
1642
+ lrc_display_updates[4], lrc_display_updates[5], lrc_display_updates[6], lrc_display_updates[7],
1643
+ details_accordion_updates[0], details_accordion_updates[1], details_accordion_updates[2], details_accordion_updates[3],
1644
+ details_accordion_updates[4], details_accordion_updates[5], details_accordion_updates[6], details_accordion_updates[7],
1645
  gr.update(interactive=True),
1646
  )
1647
 
 
1650
  """Navigate to next batch (Result View Only - Never touches Input UI)"""
1651
  if current_batch_index >= total_batches - 1:
1652
  gr.Warning(t("messages.at_last_batch"))
1653
+ return [gr.update()] * 49 # 8 audio + 2 batch files/info + 1 index + 1 indicator + 2 btns + 1 status + 1 next_status + 8 scores + 8 codes + 8 lrc + 8 accordions + 1 restore
1654
 
1655
  # Move to next batch
1656
  new_batch_index = current_batch_index + 1
 
1658
  # Load batch data from queue
1659
  if new_batch_index not in batch_queue:
1660
  gr.Warning(t("messages.batch_not_found", n=new_batch_index + 1))
1661
+ return [gr.update()] * 49
1662
 
1663
  batch_data = batch_queue[new_batch_index]
1664
  audio_paths = batch_data.get("audio_paths", [])
1665
  generation_info_text = batch_data.get("generation_info", "")
1666
 
1667
+ # Prepare audio outputs (up to 8) with subtitles
 
1668
  real_audio_paths = [p for p in audio_paths if not p.lower().endswith('.json')]
1669
+ stored_subtitles = batch_data.get("subtitles", [None] * 8)
1670
+
1671
+ audio_updates = []
1672
+ for idx in range(8):
1673
+ if idx < len(real_audio_paths):
1674
+ audio_path = real_audio_paths[idx]
1675
+ subtitles_data = stored_subtitles[idx] if idx < len(stored_subtitles) else None
1676
+ # Use gr.update to set both value and subtitles
1677
+ audio_updates.append(gr.update(value=audio_path, subtitles=subtitles_data))
1678
+ else:
1679
+ audio_updates.append(gr.update(value=None, subtitles=None))
1680
 
1681
  # Update batch indicator
1682
  batch_indicator_text = update_batch_indicator(new_batch_index, total_batches)
 
1694
  stored_scores = batch_data.get("scores", [""] * 8)
1695
  score_displays = stored_scores if stored_scores else [""] * 8
1696
 
1697
+ # Restore LRC displays from batch queue (clear if not stored)
1698
+ stored_lrcs = batch_data.get("lrcs", [""] * 8)
1699
+ lrc_displays = stored_lrcs if stored_lrcs else [""] * 8
1700
+
1701
+ # Restore codes display from batch queue
1702
+ stored_codes = batch_data.get("codes", "")
1703
+ stored_allow_lm_batch = batch_data.get("allow_lm_batch", False)
1704
+ batch_size = batch_data.get("batch_size", 2)
1705
+
1706
+ codes_display_updates = []
1707
+ lrc_display_updates = []
1708
+ details_accordion_updates = []
1709
+ for i in range(8):
1710
+ if stored_allow_lm_batch and isinstance(stored_codes, list):
1711
+ code_str = stored_codes[i] if i < len(stored_codes) else ""
1712
+ else:
1713
+ code_str = stored_codes if isinstance(stored_codes, str) and i == 0 else ""
1714
+
1715
+ lrc_str = lrc_displays[i] if i < len(lrc_displays) else ""
1716
+ score_str = score_displays[i] if i < len(score_displays) else ""
1717
+
1718
+ has_code = bool(code_str) and i < batch_size
1719
+ has_lrc = bool(lrc_str)
1720
+ has_score = bool(score_str)
1721
+
1722
+ # Show accordion if any content exists
1723
+ has_content = has_code or has_lrc or has_score
1724
+
1725
+ codes_display_updates.append(gr.update(value=code_str, visible=has_code))
1726
+ lrc_display_updates.append(gr.update(value=lrc_str, visible=has_lrc))
1727
+ details_accordion_updates.append(gr.update(visible=has_content))
1728
+
1729
  return (
1730
+ audio_updates[0], audio_updates[1], audio_updates[2], audio_updates[3],
1731
+ audio_updates[4], audio_updates[5], audio_updates[6], audio_updates[7],
1732
  audio_paths, generation_info_text, new_batch_index, batch_indicator_text,
1733
  gr.update(interactive=can_go_previous), gr.update(interactive=can_go_next),
1734
  t("messages.viewing_batch", n=new_batch_index + 1), next_batch_status_text,
1735
  score_displays[0], score_displays[1], score_displays[2], score_displays[3],
1736
  score_displays[4], score_displays[5], score_displays[6], score_displays[7],
1737
+ codes_display_updates[0], codes_display_updates[1], codes_display_updates[2], codes_display_updates[3],
1738
+ codes_display_updates[4], codes_display_updates[5], codes_display_updates[6], codes_display_updates[7],
1739
+ lrc_display_updates[0], lrc_display_updates[1], lrc_display_updates[2], lrc_display_updates[3],
1740
+ lrc_display_updates[4], lrc_display_updates[5], lrc_display_updates[6], lrc_display_updates[7],
1741
+ details_accordion_updates[0], details_accordion_updates[1], details_accordion_updates[2], details_accordion_updates[3],
1742
+ details_accordion_updates[4], details_accordion_updates[5], details_accordion_updates[6], details_accordion_updates[7],
1743
  gr.update(interactive=True),
1744
  )
1745
 
 
1751
  """
1752
  if current_batch_index not in batch_queue:
1753
  gr.Warning(t("messages.no_batch_data"))
1754
+ return [gr.update()] * 20 # Updated count: 1 codes + 19 other params
1755
 
1756
  batch_data = batch_queue[current_batch_index]
1757
  params = batch_data.get("generation_params", {})
 
1777
  track_name = params.get("track_name", None)
1778
  complete_track_classes = params.get("complete_track_classes", [])
1779
 
1780
+ # Extract codes - only restore to single input
1781
  stored_codes = batch_data.get("codes", "")
 
 
 
1782
  if stored_codes:
1783
+ if isinstance(stored_codes, list):
1784
+ # Batch mode: use first codes for single input
1785
+ codes_main = stored_codes[0] if stored_codes else ""
 
 
1786
  else:
1787
+ # Single mode
1788
+ codes_main = stored_codes
1789
+ else:
1790
+ codes_main = ""
1791
 
1792
  gr.Info(t("messages.params_restored", n=current_batch_index + 1))
1793
 
1794
  return (
1795
+ codes_main, captions, lyrics, bpm, key_scale, time_signature,
 
 
1796
  vocal_language, audio_duration, batch_size_input, inference_steps,
1797
  lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, think_checkbox,
1798
  use_cot_caption, use_cot_language, allow_lm_batch,
acestep/gradio_ui/i18n/en.json CHANGED
@@ -140,6 +140,8 @@
140
  "constrained_debug_info": "Enable debug logging for constrained decoding (check to see detailed logs)",
141
  "auto_score_label": "Auto Score",
142
  "auto_score_info": "Automatically calculate quality scores for all generated audios",
 
 
143
  "lm_batch_chunk_label": "LM Batch Chunk Size",
144
  "lm_batch_chunk_info": "Max items per LM batch chunk (default: 8, limited by GPU memory)",
145
  "codes_strength_label": "LM Codes Strength",
@@ -163,9 +165,10 @@
163
  "lrc_btn": "🎵 LRC",
164
  "quality_score_label": "Quality Score (Sample {n})",
165
  "quality_score_placeholder": "Click 'Score' to calculate perplexity-based quality score",
 
166
  "lrc_label": "Lyrics Timestamps (Sample {n})",
167
  "lrc_placeholder": "Click 'LRC' to generate timestamps",
168
- "details_accordion": "📊 Score & LRC",
169
  "generation_status": "Generation Status",
170
  "current_batch": "Current Batch",
171
  "batch_indicator": "Batch {current} / {total}",
 
140
  "constrained_debug_info": "Enable debug logging for constrained decoding (check to see detailed logs)",
141
  "auto_score_label": "Auto Score",
142
  "auto_score_info": "Automatically calculate quality scores for all generated audios",
143
+ "auto_lrc_label": "Auto LRC",
144
+ "auto_lrc_info": "Automatically generate LRC lyrics timestamps for all generated audios",
145
  "lm_batch_chunk_label": "LM Batch Chunk Size",
146
  "lm_batch_chunk_info": "Max items per LM batch chunk (default: 8, limited by GPU memory)",
147
  "codes_strength_label": "LM Codes Strength",
 
165
  "lrc_btn": "🎵 LRC",
166
  "quality_score_label": "Quality Score (Sample {n})",
167
  "quality_score_placeholder": "Click 'Score' to calculate perplexity-based quality score",
168
+ "codes_label": "LM Codes (Sample {n})",
169
  "lrc_label": "Lyrics Timestamps (Sample {n})",
170
  "lrc_placeholder": "Click 'LRC' to generate timestamps",
171
+ "details_accordion": "📊 Score & LRC & LM Codes",
172
  "generation_status": "Generation Status",
173
  "current_batch": "Current Batch",
174
  "batch_indicator": "Batch {current} / {total}",
acestep/gradio_ui/i18n/ja.json CHANGED
@@ -140,6 +140,8 @@
140
  "constrained_debug_info": "制約付きデコーディングのデバッグログを有効化(チェックすると詳細ログを表示)",
141
  "auto_score_label": "自動スコアリング",
142
  "auto_score_info": "生成されたすべてのオーディオの品質スコアを自動計算",
 
 
143
  "lm_batch_chunk_label": "LM バッチチャンクサイズ",
144
  "lm_batch_chunk_info": "LMバッチチャンクあたりの最大アイテム数(デフォルト: 8、GPUメモリによる制限)",
145
  "codes_strength_label": "LM コード強度",
@@ -163,9 +165,10 @@
163
  "lrc_btn": "🎵 LRC",
164
  "quality_score_label": "品質スコア(サンプル {n})",
165
  "quality_score_placeholder": "'スコア'をクリックしてパープレキシティベースの品質スコアを計算",
 
166
  "lrc_label": "歌詞タイムスタンプ(サンプル {n})",
167
  "lrc_placeholder": "'LRC'をクリックしてタイムスタンプを生成",
168
- "details_accordion": "📊 スコア & LRC",
169
  "generation_status": "生成ステータス",
170
  "current_batch": "現在のバッチ",
171
  "batch_indicator": "バッチ {current} / {total}",
 
140
  "constrained_debug_info": "制約付きデコーディングのデバッグログを有効化(チェックすると詳細ログを表示)",
141
  "auto_score_label": "自動スコアリング",
142
  "auto_score_info": "生成されたすべてのオーディオの品質スコアを自動計算",
143
+ "auto_lrc_label": "自動 LRC",
144
+ "auto_lrc_info": "生成されたすべてのオーディオのLRC歌詞タイムスタンプを自動生成",
145
  "lm_batch_chunk_label": "LM バッチチャンクサイズ",
146
  "lm_batch_chunk_info": "LMバッチチャンクあたりの最大アイテム数(デフォルト: 8、GPUメモリによる制限)",
147
  "codes_strength_label": "LM コード強度",
 
165
  "lrc_btn": "🎵 LRC",
166
  "quality_score_label": "品質スコア(サンプル {n})",
167
  "quality_score_placeholder": "'スコア'をクリックしてパープレキシティベースの品質スコアを計算",
168
+ "codes_label": "LM コード(サンプル {n})",
169
  "lrc_label": "歌詞タイムスタンプ(サンプル {n})",
170
  "lrc_placeholder": "'LRC'をクリックしてタイムスタンプを生成",
171
+ "details_accordion": "📊 スコア & LRC & LM コード",
172
  "generation_status": "生成ステータス",
173
  "current_batch": "現在のバッチ",
174
  "batch_indicator": "バッチ {current} / {total}",
acestep/gradio_ui/i18n/zh.json CHANGED
@@ -140,6 +140,8 @@
140
  "constrained_debug_info": "启用约束解码的调试日志(勾选以查看详细日志)",
141
  "auto_score_label": "自动评分",
142
  "auto_score_info": "自动计算所有生成音频的质量分数",
 
 
143
  "lm_batch_chunk_label": "LM 批量块大小",
144
  "lm_batch_chunk_info": "每个LM批量块的最大项目数(默认: 8, 受GPU内存限制)",
145
  "codes_strength_label": "LM 代码强度",
@@ -163,9 +165,10 @@
163
  "lrc_btn": "🎵 LRC",
164
  "quality_score_label": "质量分数(样本 {n})",
165
  "quality_score_placeholder": "点击'评分'以计算基于困惑度的质量分数",
 
166
  "lrc_label": "歌词时间戳(样本 {n})",
167
  "lrc_placeholder": "点击'LRC'生成时间戳",
168
- "details_accordion": "📊 评分与LRC",
169
  "generation_status": "生成状态",
170
  "current_batch": "当前批次",
171
  "batch_indicator": "批次 {current} / {total}",
 
140
  "constrained_debug_info": "启用约束解码的调试日志(勾选以查看详细日志)",
141
  "auto_score_label": "自动评分",
142
  "auto_score_info": "自动计算所有生成音频的质量分数",
143
+ "auto_lrc_label": "自动 LRC",
144
+ "auto_lrc_info": "自动为所有生成的音频生成LRC歌词时间戳",
145
  "lm_batch_chunk_label": "LM 批量块大小",
146
  "lm_batch_chunk_info": "每个LM批量块的最大项目数(默认: 8, 受GPU内存限制)",
147
  "codes_strength_label": "LM 代码强度",
 
165
  "lrc_btn": "🎵 LRC",
166
  "quality_score_label": "质量分数(样本 {n})",
167
  "quality_score_placeholder": "点击'评分'以计算基于困惑度的质量分数",
168
+ "codes_label": "LM 代码(样本 {n})",
169
  "lrc_label": "歌词时间戳(样本 {n})",
170
  "lrc_placeholder": "点击'LRC'生成时间戳",
171
+ "details_accordion": "📊 评分与LRC与LM代码",
172
  "generation_status": "生成状态",
173
  "current_batch": "当前批次",
174
  "batch_indicator": "批次 {current} / {total}",
acestep/gradio_ui/interfaces/generation.py CHANGED
@@ -218,10 +218,9 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
218
  size="sm"
219
  )
220
 
221
- # Audio Codes for text2music (dynamic display based on batch size and allow_lm_batch)
222
  with gr.Accordion(t("generation.lm_codes_hints"), open=False, visible=True) as text2music_audio_codes_group:
223
- # Single codes input (default mode)
224
- with gr.Row(equal_height=True, visible=True) as codes_single_row:
225
  text2music_audio_code_string = gr.Textbox(
226
  label=t("generation.lm_codes_label"),
227
  placeholder=t("generation.lm_codes_placeholder"),
@@ -235,68 +234,6 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
235
  size="sm",
236
  scale=1,
237
  )
238
-
239
- # Multiple codes inputs (batch mode when allow_lm_batch is enabled)
240
- with gr.Row(visible=False) as codes_batch_row:
241
- with gr.Column(visible=True) as codes_col_1:
242
- text2music_audio_code_string_1 = gr.Textbox(
243
- label=t("generation.lm_codes_sample", n=1),
244
- placeholder="<|audio_code_...|>",
245
- lines=4,
246
- info=t("generation.lm_codes_sample_info", n=1),
247
- )
248
- with gr.Column(visible=True) as codes_col_2:
249
- text2music_audio_code_string_2 = gr.Textbox(
250
- label=t("generation.lm_codes_sample", n=2),
251
- placeholder="<|audio_code_...|>",
252
- lines=4,
253
- info=t("generation.lm_codes_sample_info", n=2),
254
- )
255
- with gr.Column(visible=False) as codes_col_3:
256
- text2music_audio_code_string_3 = gr.Textbox(
257
- label=t("generation.lm_codes_sample", n=3),
258
- placeholder="<|audio_code_...|>",
259
- lines=4,
260
- info=t("generation.lm_codes_sample_info", n=3),
261
- )
262
- with gr.Column(visible=False) as codes_col_4:
263
- text2music_audio_code_string_4 = gr.Textbox(
264
- label=t("generation.lm_codes_sample", n=4),
265
- placeholder="<|audio_code_...|>",
266
- lines=4,
267
- info=t("generation.lm_codes_sample_info", n=4),
268
- )
269
-
270
- # Additional row for codes 5-8
271
- with gr.Row(visible=False) as codes_batch_row_2:
272
- with gr.Column() as codes_col_5:
273
- text2music_audio_code_string_5 = gr.Textbox(
274
- label=t("generation.lm_codes_sample", n=5),
275
- placeholder="<|audio_code_...|>",
276
- lines=4,
277
- info=t("generation.lm_codes_sample_info", n=5),
278
- )
279
- with gr.Column() as codes_col_6:
280
- text2music_audio_code_string_6 = gr.Textbox(
281
- label=t("generation.lm_codes_sample", n=6),
282
- placeholder="<|audio_code_...|>",
283
- lines=4,
284
- info=t("generation.lm_codes_sample_info", n=6),
285
- )
286
- with gr.Column() as codes_col_7:
287
- text2music_audio_code_string_7 = gr.Textbox(
288
- label=t("generation.lm_codes_sample", n=7),
289
- placeholder="<|audio_code_...|>",
290
- lines=4,
291
- info=t("generation.lm_codes_sample_info", n=7),
292
- )
293
- with gr.Column() as codes_col_8:
294
- text2music_audio_code_string_8 = gr.Textbox(
295
- label=t("generation.lm_codes_sample", n=8),
296
- placeholder="<|audio_code_...|>",
297
- lines=4,
298
- info=t("generation.lm_codes_sample_info", n=8),
299
- )
300
 
301
  # Repainting controls
302
  with gr.Group(visible=False) as repainting_group:
@@ -541,6 +478,12 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
541
  info=t("generation.auto_score_info"),
542
  scale=1,
543
  )
 
 
 
 
 
 
544
  lm_batch_chunk_size = gr.Number(
545
  label=t("generation.lm_batch_chunk_label"),
546
  value=8,
@@ -581,27 +524,30 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
581
  # Set generate_btn to interactive if service is pre-initialized
582
  generate_btn_interactive = init_params.get('enable_generate', False) if service_pre_initialized else False
583
  with gr.Row(equal_height=True):
584
- think_checkbox = gr.Checkbox(
585
- label=t("generation.think_label"),
586
- value=True,
587
- scale=1,
588
- )
589
- allow_lm_batch = gr.Checkbox(
590
- label=t("generation.parallel_thinking_label"),
591
- value=True,
592
- scale=1,
593
- )
594
- generate_btn = gr.Button(t("generation.generate_btn"), variant="primary", size="lg", interactive=generate_btn_interactive, scale=9)
595
- autogen_checkbox = gr.Checkbox(
596
- label=t("generation.autogen_label"),
597
- value=True,
598
- scale=1,
599
- )
600
- use_cot_caption = gr.Checkbox(
601
- label=t("generation.caption_rewrite_label"),
602
- value=True,
603
- scale=1,
604
- )
 
 
 
605
 
606
  return {
607
  "service_config_accordion": service_config_accordion,
@@ -669,25 +615,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
669
  "score_scale": score_scale,
670
  "allow_lm_batch": allow_lm_batch,
671
  "auto_score": auto_score,
 
672
  "lm_batch_chunk_size": lm_batch_chunk_size,
673
- "codes_single_row": codes_single_row,
674
- "codes_batch_row": codes_batch_row,
675
- "codes_batch_row_2": codes_batch_row_2,
676
- "text2music_audio_code_string_1": text2music_audio_code_string_1,
677
- "text2music_audio_code_string_2": text2music_audio_code_string_2,
678
- "text2music_audio_code_string_3": text2music_audio_code_string_3,
679
- "text2music_audio_code_string_4": text2music_audio_code_string_4,
680
- "text2music_audio_code_string_5": text2music_audio_code_string_5,
681
- "text2music_audio_code_string_6": text2music_audio_code_string_6,
682
- "text2music_audio_code_string_7": text2music_audio_code_string_7,
683
- "text2music_audio_code_string_8": text2music_audio_code_string_8,
684
- "codes_col_1": codes_col_1,
685
- "codes_col_2": codes_col_2,
686
- "codes_col_3": codes_col_3,
687
- "codes_col_4": codes_col_4,
688
- "codes_col_5": codes_col_5,
689
- "codes_col_6": codes_col_6,
690
- "codes_col_7": codes_col_7,
691
- "codes_col_8": codes_col_8,
692
  }
693
 
 
218
  size="sm"
219
  )
220
 
221
+ # Audio Codes for text2music - single input for transcription or cover task
222
  with gr.Accordion(t("generation.lm_codes_hints"), open=False, visible=True) as text2music_audio_codes_group:
223
+ with gr.Row(equal_height=True):
 
224
  text2music_audio_code_string = gr.Textbox(
225
  label=t("generation.lm_codes_label"),
226
  placeholder=t("generation.lm_codes_placeholder"),
 
234
  size="sm",
235
  scale=1,
236
  )
 
237
 
238
  # Repainting controls
239
  with gr.Group(visible=False) as repainting_group:
 
478
  info=t("generation.auto_score_info"),
479
  scale=1,
480
  )
481
+ auto_lrc = gr.Checkbox(
482
+ label=t("generation.auto_lrc_label"),
483
+ value=False,
484
+ info=t("generation.auto_lrc_info"),
485
+ scale=1,
486
+ )
487
  lm_batch_chunk_size = gr.Number(
488
  label=t("generation.lm_batch_chunk_label"),
489
  value=8,
 
524
  # Set generate_btn to interactive if service is pre-initialized
525
  generate_btn_interactive = init_params.get('enable_generate', False) if service_pre_initialized else False
526
  with gr.Row(equal_height=True):
527
+ with gr.Column(scale=1, variant="compact"):
528
+ think_checkbox = gr.Checkbox(
529
+ label=t("generation.think_label"),
530
+ value=True,
531
+ scale=1,
532
+ )
533
+ allow_lm_batch = gr.Checkbox(
534
+ label=t("generation.parallel_thinking_label"),
535
+ value=True,
536
+ scale=1,
537
+ )
538
+ with gr.Column(scale=18):
539
+ generate_btn = gr.Button(t("generation.generate_btn"), variant="primary", size="lg", interactive=generate_btn_interactive)
540
+ with gr.Column(scale=1, variant="compact"):
541
+ autogen_checkbox = gr.Checkbox(
542
+ label=t("generation.autogen_label"),
543
+ value=True,
544
+ scale=1,
545
+ )
546
+ use_cot_caption = gr.Checkbox(
547
+ label=t("generation.caption_rewrite_label"),
548
+ value=True,
549
+ scale=1,
550
+ )
551
 
552
  return {
553
  "service_config_accordion": service_config_accordion,
 
615
  "score_scale": score_scale,
616
  "allow_lm_batch": allow_lm_batch,
617
  "auto_score": auto_score,
618
+ "auto_lrc": auto_lrc,
619
  "lm_batch_chunk_size": lm_batch_chunk_size,
 
 
620
  }
621
 
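For orientation, the reworked generate row above wraps the four checkboxes in two compact side columns around a wide button column. A minimal standalone sketch of that arrangement, with hardcoded labels standing in for the repo's t(...) i18n lookups and a stock Gradio install assumed:

import gradio as gr

with gr.Blocks() as demo:
    with gr.Row(equal_height=True):
        with gr.Column(scale=1, variant="compact"):
            think_checkbox = gr.Checkbox(label="Think", value=True, scale=1)
            allow_lm_batch = gr.Checkbox(label="Parallel Thinking", value=True, scale=1)
        with gr.Column(scale=18):
            generate_btn = gr.Button("Generate", variant="primary", size="lg")
        with gr.Column(scale=1, variant="compact"):
            autogen_checkbox = gr.Checkbox(label="Auto-Gen", value=True, scale=1)
            use_cot_caption = gr.Checkbox(label="Caption Rewrite", value=True, scale=1)

if __name__ == "__main__":
    demo.launch()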
acestep/gradio_ui/interfaces/result.py CHANGED
@@ -29,7 +29,7 @@ def create_results_section(dit_handler) -> dict:
29
  label=t("results.generated_music", n=1),
30
  type="filepath",
31
  interactive=False,
32
- show_download_button=False
33
  )
34
  with gr.Row(equal_height=True):
35
  send_to_src_btn_1 = gr.Button(
@@ -57,15 +57,25 @@ def create_results_section(dit_handler) -> dict:
57
  scale=1
58
  )
59
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_1:
 
 
 
 
 
 
 
60
  score_display_1 = gr.Textbox(
61
  label=t("results.quality_score_label", n=1),
62
  interactive=False,
 
 
63
  visible=False
64
  )
65
  lrc_display_1 = gr.Textbox(
66
  label=t("results.lrc_label", n=1),
67
- interactive=False,
68
- lines=8,
 
69
  visible=False
70
  )
71
  with gr.Column(visible=True) as audio_col_2:
@@ -73,7 +83,7 @@ def create_results_section(dit_handler) -> dict:
73
  label=t("results.generated_music", n=2),
74
  type="filepath",
75
  interactive=False,
76
- show_download_button=False
77
  )
78
  with gr.Row(equal_height=True):
79
  send_to_src_btn_2 = gr.Button(
@@ -101,15 +111,25 @@ def create_results_section(dit_handler) -> dict:
101
  scale=1
102
  )
103
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_2:
 
 
 
 
 
 
 
104
  score_display_2 = gr.Textbox(
105
  label=t("results.quality_score_label", n=2),
106
  interactive=False,
 
 
107
  visible=False
108
  )
109
  lrc_display_2 = gr.Textbox(
110
  label=t("results.lrc_label", n=2),
111
- interactive=False,
112
- lines=8,
 
113
  visible=False
114
  )
115
  with gr.Column(visible=False) as audio_col_3:
@@ -117,7 +137,7 @@ def create_results_section(dit_handler) -> dict:
117
  label=t("results.generated_music", n=3),
118
  type="filepath",
119
  interactive=False,
120
- show_download_button=False
121
  )
122
  with gr.Row(equal_height=True):
123
  send_to_src_btn_3 = gr.Button(
@@ -145,15 +165,25 @@ def create_results_section(dit_handler) -> dict:
145
  scale=1
146
  )
147
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_3:
 
 
 
 
 
 
 
148
  score_display_3 = gr.Textbox(
149
  label=t("results.quality_score_label", n=3),
150
  interactive=False,
 
 
151
  visible=False
152
  )
153
  lrc_display_3 = gr.Textbox(
154
  label=t("results.lrc_label", n=3),
155
- interactive=False,
156
- lines=8,
 
157
  visible=False
158
  )
159
  with gr.Column(visible=False) as audio_col_4:
@@ -161,7 +191,7 @@ def create_results_section(dit_handler) -> dict:
161
  label=t("results.generated_music", n=4),
162
  type="filepath",
163
  interactive=False,
164
- show_download_button=False
165
  )
166
  with gr.Row(equal_height=True):
167
  send_to_src_btn_4 = gr.Button(
@@ -189,15 +219,25 @@ def create_results_section(dit_handler) -> dict:
189
  scale=1
190
  )
191
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_4:
 
 
 
 
 
 
 
192
  score_display_4 = gr.Textbox(
193
  label=t("results.quality_score_label", n=4),
194
  interactive=False,
 
 
195
  visible=False
196
  )
197
  lrc_display_4 = gr.Textbox(
198
  label=t("results.lrc_label", n=4),
199
- interactive=False,
200
- lines=8,
 
201
  visible=False
202
  )
203
 
@@ -208,7 +248,7 @@ def create_results_section(dit_handler) -> dict:
208
  label=t("results.generated_music", n=5),
209
  type="filepath",
210
  interactive=False,
211
- show_download_button=False
212
  )
213
  with gr.Row(equal_height=True):
214
  send_to_src_btn_5 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -216,15 +256,25 @@ def create_results_section(dit_handler) -> dict:
216
  score_btn_5 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
217
  lrc_btn_5 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
218
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_5:
 
 
 
 
 
 
 
219
  score_display_5 = gr.Textbox(
220
  label=t("results.quality_score_label", n=5),
221
  interactive=False,
 
 
222
  visible=False
223
  )
224
  lrc_display_5 = gr.Textbox(
225
  label=t("results.lrc_label", n=5),
226
- interactive=False,
227
- lines=8,
 
228
  visible=False
229
  )
230
  with gr.Column() as audio_col_6:
@@ -232,7 +282,7 @@ def create_results_section(dit_handler) -> dict:
232
  label=t("results.generated_music", n=6),
233
  type="filepath",
234
  interactive=False,
235
- show_download_button=False
236
  )
237
  with gr.Row(equal_height=True):
238
  send_to_src_btn_6 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -240,15 +290,25 @@ def create_results_section(dit_handler) -> dict:
240
  score_btn_6 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
241
  lrc_btn_6 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
242
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_6:
 
 
 
 
 
 
 
243
  score_display_6 = gr.Textbox(
244
  label=t("results.quality_score_label", n=6),
245
  interactive=False,
 
 
246
  visible=False
247
  )
248
  lrc_display_6 = gr.Textbox(
249
  label=t("results.lrc_label", n=6),
250
- interactive=False,
251
- lines=8,
 
252
  visible=False
253
  )
254
  with gr.Column() as audio_col_7:
@@ -256,7 +316,7 @@ def create_results_section(dit_handler) -> dict:
256
  label=t("results.generated_music", n=7),
257
  type="filepath",
258
  interactive=False,
259
- show_download_button=False
260
  )
261
  with gr.Row(equal_height=True):
262
  send_to_src_btn_7 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -264,15 +324,25 @@ def create_results_section(dit_handler) -> dict:
264
  score_btn_7 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
265
  lrc_btn_7 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
266
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_7:
 
 
 
 
 
 
 
267
  score_display_7 = gr.Textbox(
268
  label=t("results.quality_score_label", n=7),
269
  interactive=False,
 
 
270
  visible=False
271
  )
272
  lrc_display_7 = gr.Textbox(
273
  label=t("results.lrc_label", n=7),
274
- interactive=False,
275
- lines=8,
 
276
  visible=False
277
  )
278
  with gr.Column() as audio_col_8:
@@ -280,7 +350,7 @@ def create_results_section(dit_handler) -> dict:
280
  label=t("results.generated_music", n=8),
281
  type="filepath",
282
  interactive=False,
283
- show_download_button=False
284
  )
285
  with gr.Row(equal_height=True):
286
  send_to_src_btn_8 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -288,15 +358,25 @@ def create_results_section(dit_handler) -> dict:
288
  score_btn_8 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
289
  lrc_btn_8 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
290
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_8:
 
 
 
 
 
 
 
291
  score_display_8 = gr.Textbox(
292
  label=t("results.quality_score_label", n=8),
293
  interactive=False,
 
 
294
  visible=False
295
  )
296
  lrc_display_8 = gr.Textbox(
297
  label=t("results.lrc_label", n=8),
298
- interactive=False,
299
- lines=8,
 
300
  visible=False
301
  )
302
 
@@ -410,6 +490,14 @@ def create_results_section(dit_handler) -> dict:
410
  "score_display_6": score_display_6,
411
  "score_display_7": score_display_7,
412
  "score_display_8": score_display_8,
 
 
 
 
 
 
 
 
413
  "lrc_btn_1": lrc_btn_1,
414
  "lrc_btn_2": lrc_btn_2,
415
  "lrc_btn_3": lrc_btn_3,
 
29
  label=t("results.generated_music", n=1),
30
  type="filepath",
31
  interactive=False,
32
+ buttons=[]
33
  )
34
  with gr.Row(equal_height=True):
35
  send_to_src_btn_1 = gr.Button(
 
57
  scale=1
58
  )
59
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_1:
60
+ codes_display_1 = gr.Textbox(
61
+ label=t("results.codes_label", n=1),
62
+ interactive=False,
63
+ buttons=["copy"],
64
+ max_lines=4,
65
+ visible=False
66
+ )
67
  score_display_1 = gr.Textbox(
68
  label=t("results.quality_score_label", n=1),
69
  interactive=False,
70
+ buttons=["copy"],
71
+ max_lines=6,
72
  visible=False
73
  )
74
  lrc_display_1 = gr.Textbox(
75
  label=t("results.lrc_label", n=1),
76
+ interactive=True,
77
+ buttons=["copy"],
78
+ max_lines=8,
79
  visible=False
80
  )
81
  with gr.Column(visible=True) as audio_col_2:
 
83
  label=t("results.generated_music", n=2),
84
  type="filepath",
85
  interactive=False,
86
+ buttons=[]
87
  )
88
  with gr.Row(equal_height=True):
89
  send_to_src_btn_2 = gr.Button(
 
111
  scale=1
112
  )
113
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_2:
114
+ codes_display_2 = gr.Textbox(
115
+ label=t("results.codes_label", n=2),
116
+ interactive=False,
117
+ buttons=["copy"],
118
+ max_lines=4,
119
+ visible=False
120
+ )
121
  score_display_2 = gr.Textbox(
122
  label=t("results.quality_score_label", n=2),
123
  interactive=False,
124
+ buttons=["copy"],
125
+ max_lines=6,
126
  visible=False
127
  )
128
  lrc_display_2 = gr.Textbox(
129
  label=t("results.lrc_label", n=2),
130
+ interactive=True,
131
+ buttons=["copy"],
132
+ max_lines=8,
133
  visible=False
134
  )
135
  with gr.Column(visible=False) as audio_col_3:
 
137
  label=t("results.generated_music", n=3),
138
  type="filepath",
139
  interactive=False,
140
+ buttons=[]
141
  )
142
  with gr.Row(equal_height=True):
143
  send_to_src_btn_3 = gr.Button(
 
165
  scale=1
166
  )
167
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_3:
168
+ codes_display_3 = gr.Textbox(
169
+ label=t("results.codes_label", n=3),
170
+ interactive=False,
171
+ buttons=["copy"],
172
+ max_lines=4,
173
+ visible=False
174
+ )
175
  score_display_3 = gr.Textbox(
176
  label=t("results.quality_score_label", n=3),
177
  interactive=False,
178
+ buttons=["copy"],
179
+ max_lines=6,
180
  visible=False
181
  )
182
  lrc_display_3 = gr.Textbox(
183
  label=t("results.lrc_label", n=3),
184
+ interactive=True,
185
+ buttons=["copy"],
186
+ max_lines=8,
187
  visible=False
188
  )
189
  with gr.Column(visible=False) as audio_col_4:
 
191
  label=t("results.generated_music", n=4),
192
  type="filepath",
193
  interactive=False,
194
+ buttons=[]
195
  )
196
  with gr.Row(equal_height=True):
197
  send_to_src_btn_4 = gr.Button(
 
219
  scale=1
220
  )
221
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_4:
222
+ codes_display_4 = gr.Textbox(
223
+ label=t("results.codes_label", n=4),
224
+ interactive=False,
225
+ buttons=["copy"],
226
+ max_lines=4,
227
+ visible=False
228
+ )
229
  score_display_4 = gr.Textbox(
230
  label=t("results.quality_score_label", n=4),
231
  interactive=False,
232
+ buttons=["copy"],
233
+ max_lines=6,
234
  visible=False
235
  )
236
  lrc_display_4 = gr.Textbox(
237
  label=t("results.lrc_label", n=4),
238
+ interactive=True,
239
+ buttons=["copy"],
240
+ max_lines=8,
241
  visible=False
242
  )
243
 
 
248
  label=t("results.generated_music", n=5),
249
  type="filepath",
250
  interactive=False,
251
+ buttons=[]
252
  )
253
  with gr.Row(equal_height=True):
254
  send_to_src_btn_5 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
 
256
  score_btn_5 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
257
  lrc_btn_5 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
258
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_5:
259
+ codes_display_5 = gr.Textbox(
260
+ label=t("results.codes_label", n=5),
261
+ interactive=False,
262
+ buttons=["copy"],
263
+ max_lines=4,
264
+ visible=False
265
+ )
266
  score_display_5 = gr.Textbox(
267
  label=t("results.quality_score_label", n=5),
268
  interactive=False,
269
+ buttons=["copy"],
270
+ max_lines=6,
271
  visible=False
272
  )
273
  lrc_display_5 = gr.Textbox(
274
  label=t("results.lrc_label", n=5),
275
+ interactive=True,
276
+ buttons=["copy"],
277
+ max_lines=8,
278
  visible=False
279
  )
280
  with gr.Column() as audio_col_6:
 
282
  label=t("results.generated_music", n=6),
283
  type="filepath",
284
  interactive=False,
285
+ buttons=[]
286
  )
287
  with gr.Row(equal_height=True):
288
  send_to_src_btn_6 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
 
290
  score_btn_6 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
291
  lrc_btn_6 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
292
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_6:
293
+ codes_display_6 = gr.Textbox(
294
+ label=t("results.codes_label", n=6),
295
+ interactive=False,
296
+ buttons=["copy"],
297
+ max_lines=4,
298
+ visible=False
299
+ )
300
  score_display_6 = gr.Textbox(
301
  label=t("results.quality_score_label", n=6),
302
  interactive=False,
303
+ buttons=["copy"],
304
+ max_lines=6,
305
  visible=False
306
  )
307
  lrc_display_6 = gr.Textbox(
308
  label=t("results.lrc_label", n=6),
309
+ interactive=True,
310
+ buttons=["copy"],
311
+ max_lines=8,
312
  visible=False
313
  )
314
  with gr.Column() as audio_col_7:
 
316
  label=t("results.generated_music", n=7),
317
  type="filepath",
318
  interactive=False,
319
+ buttons=[]
320
  )
321
  with gr.Row(equal_height=True):
322
  send_to_src_btn_7 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
 
324
  score_btn_7 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
325
  lrc_btn_7 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
326
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_7:
327
+ codes_display_7 = gr.Textbox(
328
+ label=t("results.codes_label", n=7),
329
+ interactive=False,
330
+ buttons=["copy"],
331
+ max_lines=4,
332
+ visible=False
333
+ )
334
  score_display_7 = gr.Textbox(
335
  label=t("results.quality_score_label", n=7),
336
  interactive=False,
337
+ buttons=["copy"],
338
+ max_lines=6,
339
  visible=False
340
  )
341
  lrc_display_7 = gr.Textbox(
342
  label=t("results.lrc_label", n=7),
343
+ interactive=True,
344
+ buttons=["copy"],
345
+ max_lines=8,
346
  visible=False
347
  )
348
  with gr.Column() as audio_col_8:
 
350
  label=t("results.generated_music", n=8),
351
  type="filepath",
352
  interactive=False,
353
+ buttons=[]
354
  )
355
  with gr.Row(equal_height=True):
356
  send_to_src_btn_8 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
 
358
  score_btn_8 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
359
  lrc_btn_8 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
360
  with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_8:
361
+ codes_display_8 = gr.Textbox(
362
+ label=t("results.codes_label", n=8),
363
+ interactive=False,
364
+ buttons=["copy"],
365
+ max_lines=4,
366
+ visible=False
367
+ )
368
  score_display_8 = gr.Textbox(
369
  label=t("results.quality_score_label", n=8),
370
  interactive=False,
371
+ buttons=["copy"],
372
+ max_lines=6,
373
  visible=False
374
  )
375
  lrc_display_8 = gr.Textbox(
376
  label=t("results.lrc_label", n=8),
377
+ interactive=True,
378
+ buttons=["copy"],
379
+ max_lines=8,
380
  visible=False
381
  )
382
 
 
490
  "score_display_6": score_display_6,
491
  "score_display_7": score_display_7,
492
  "score_display_8": score_display_8,
493
+ "codes_display_1": codes_display_1,
494
+ "codes_display_2": codes_display_2,
495
+ "codes_display_3": codes_display_3,
496
+ "codes_display_4": codes_display_4,
497
+ "codes_display_5": codes_display_5,
498
+ "codes_display_6": codes_display_6,
499
+ "codes_display_7": codes_display_7,
500
+ "codes_display_8": codes_display_8,
501
  "lrc_btn_1": lrc_btn_1,
502
  "lrc_btn_2": lrc_btn_2,
503
  "lrc_btn_3": lrc_btn_3,
acestep/llm_inference.py CHANGED
@@ -773,7 +773,12 @@ class LLMHandler:
773
  cot_items = {}
774
  for key in ['bpm', 'caption', 'duration', 'keyscale', 'language', 'timesignature']:
775
  if key in metadata and metadata[key] is not None:
776
- cot_items[key] = metadata[key]
 
 
 
 
 
777
 
778
  # Format as YAML (sorted keys, unicode support)
779
  if len(cot_items) > 0:
 
773
  cot_items = {}
774
  for key in ['bpm', 'caption', 'duration', 'keyscale', 'language', 'timesignature']:
775
  if key in metadata and metadata[key] is not None:
776
+ value = metadata[key]
777
+ if key == "timesignature" and isinstance(value, str) and value.endswith("/4"):
778
+ value = value.split("/")[0]
779
+ if isinstance(value, str) and value.isdigit():
780
+ value = int(value)
781
+ cot_items[key] = value
782
 
783
  # Format as YAML (sorted keys, unicode support)
784
  if len(cot_items) > 0:
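
A quick, self-contained illustration of what the new CoT metadata normalization does; the sample metadata values below are invented for the example, and in the repo this loop runs inside LLMHandler before the YAML formatting step.

metadata = {"bpm": "120", "timesignature": "4/4", "keyscale": "C major", "caption": None}

cot_items = {}
for key in ["bpm", "caption", "duration", "keyscale", "language", "timesignature"]:
    if key in metadata and metadata[key] is not None:
        value = metadata[key]
        if key == "timesignature" and isinstance(value, str) and value.endswith("/4"):
            value = value.split("/")[0]   # "4/4" -> "4"; non-/4 signatures pass through unchanged
        if isinstance(value, str) and value.isdigit():
            value = int(value)            # "120" -> 120, "4" -> 4
        cot_items[key] = value

print(cot_items)  # {'bpm': 120, 'keyscale': 'C major', 'timesignature': 4}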