ChuxiJ committed on
Commit 03f73c6 · 1 Parent(s): 11860f1

fix bugs and test profile
acestep/audio_utils.py CHANGED
@@ -98,7 +98,6 @@ class AudioSaver:
                 channels_first=True,
                 backend='ffmpeg',
                 compression=config,
-                buffer_size=65536
             )
         elif format in ["flac", "wav"]:
             # FLAC and WAV use soundfile backend (fastest)
@@ -107,8 +106,7 @@ class AudioSaver:
                 audio_tensor,
                 sample_rate,
                 channels_first=True,
-                backend='soundfile',
-                buffer_size=65536
+                backend='ffmpeg',
             )
         else:
             # Other formats use default backend
@@ -117,7 +115,6 @@ class AudioSaver:
                 audio_tensor,
                 sample_rate,
                 channels_first=True,
-                buffer_size=65536
             )
 
         logger.debug(f"[AudioSaver] Saved audio to {output_path} ({format}, {sample_rate}Hz)")
@@ -247,87 +244,17 @@ def get_audio_file_hash(audio_file) -> str:
     return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
 
 
-def generate_uuid_from_params(
-    captions: str,
-    lyrics: str,
-    bpm: Optional[int],
-    key_scale: str,
-    time_signature: str,
-    vocal_language: str,
-    inference_steps: int,
-    guidance_scale: float,
-    seed: Union[str, float, int],
-    audio_duration: Optional[float],
-    audio_code_string: Union[str, List[str]],
-    repainting_start: float,
-    repainting_end: Optional[float],
-    instruction: str,
-    audio_cover_strength: float,
-    task_type: str,
-    use_adg: bool,
-    cfg_interval_start: float,
-    cfg_interval_end: float,
-    audio_format: str,
-    reference_audio=None,
-    src_audio=None,
-    batch_index: int = 0,
-) -> str:
+def generate_uuid_from_params(params_dict) -> str:
     """
     Generate deterministic UUID from generation parameters.
     Same parameters will always generate the same UUID.
 
     Args:
-        captions: Music caption
-        lyrics: Lyrics text
-        bpm: BPM value
-        key_scale: Musical key and scale
-        time_signature: Time signature
-        vocal_language: Vocal language code
-        inference_steps: Number of inference steps
-        guidance_scale: Guidance scale
-        seed: Random seed
-        audio_duration: Audio duration in seconds
-        audio_code_string: Audio code string or list
-        repainting_start: Repainting start time
-        repainting_end: Repainting end time
-        instruction: Task instruction
-        audio_cover_strength: Audio cover strength
-        task_type: Task type
-        use_adg: Whether to use ADG
-        cfg_interval_start: CFG interval start
-        cfg_interval_end: CFG interval end
-        audio_format: Audio format
-        reference_audio: Reference audio file path
-        src_audio: Source audio file path
-        batch_index: Index in batch (for audio_code_string list access)
+        params_dict: Dictionary of parameters
 
     Returns:
         UUID string
     """
-    params_dict = {
-        "captions": captions or "",
-        "lyrics": lyrics or "",
-        "bpm": bpm,
-        "key_scale": key_scale or "",
-        "time_signature": time_signature or "",
-        "vocal_language": vocal_language or "",
-        "inference_steps": inference_steps,
-        "guidance_scale": guidance_scale,
-        "seed": seed,
-        "audio_duration": audio_duration,
-        "audio_code_string": audio_code_string if isinstance(audio_code_string, str) else (audio_code_string[batch_index] if isinstance(audio_code_string, list) and batch_index < len(audio_code_string) else ""),
-        "repainting_start": repainting_start,
-        "repainting_end": repainting_end,
-        "instruction": instruction or "",
-        "audio_cover_strength": audio_cover_strength,
-        "task_type": task_type or "",
-        "use_adg": use_adg,
-        "cfg_interval_start": cfg_interval_start,
-        "cfg_interval_end": cfg_interval_end,
-        "audio_format": audio_format or "",
-        "reference_audio_hash": get_audio_file_hash(reference_audio),
-        "src_audio_hash": get_audio_file_hash(src_audio),
-    }
 
     params_json = json.dumps(params_dict, sort_keys=True, ensure_ascii=False)
     hash_obj = hashlib.sha256(params_json.encode('utf-8'))
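The refactor above collapses two dozen positional arguments into a single params_dict that the caller assembles, serialized with sort_keys=True so logically equal dicts always hash the same. A minimal sketch of the idea follows; the digest-to-UUID folding and the helper name are assumptions, since the hunk is cut off right after hash_obj:

    import hashlib
    import json
    import uuid

    def params_uuid(params_dict) -> str:
        # Sorted keys make the JSON canonical, so equal dicts hash equally.
        params_json = json.dumps(params_dict, sort_keys=True, ensure_ascii=False)
        digest = hashlib.sha256(params_json.encode('utf-8')).hexdigest()
        # One plausible final step: fold the first 128 bits into UUID form.
        return str(uuid.UUID(digest[:32]))

    # Same parameters in any key order yield the same UUID.
    assert params_uuid({"captions": "lofi", "seed": 42}) == params_uuid({"seed": 42, "captions": "lofi"})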
acestep/gradio_ui/events/__init__.py CHANGED
@@ -331,10 +331,11 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             ],
             outputs=[results_section[f"score_display_{btn_idx}"], results_section["batch_queue"]]
         )
-
+    def generation_wrapper(*args):
+        yield from res_h.generate_with_batch_management(dit_handler, llm_handler, *args)
     # ========== Generation Handler ==========
     generation_section["generate_btn"].click(
-        fn=lambda *args: res_h.generate_with_batch_management(dit_handler, llm_handler, *args),
+        fn=generation_wrapper,
        inputs=[
            generation_section["captions"],
            generation_section["lyrics"],
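The swap from a lambda to generation_wrapper matters because generate_with_batch_management is now a generator: Gradio streams partial updates only when the registered handler is itself a generator function, and a lambda cannot contain yield. A standalone sketch of the difference (illustrative names, not from the repo):

    import inspect

    def stream(n):
        for i in range(n):
            yield f"step {i}"

    as_lambda = lambda n: stream(n)   # returns the generator object as one value

    def as_wrapper(n):
        yield from stream(n)          # re-yields every partial update

    print(inspect.isgeneratorfunction(as_lambda))   # False -> no streaming
    print(inspect.isgeneratorfunction(as_wrapper))  # True  -> streamed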
acestep/gradio_ui/events/results_handlers.py CHANGED
@@ -10,9 +10,123 @@ import tempfile
 import shutil
 import zipfile
 import time as time_module
+from typing import Dict, Any, Optional
 import gradio as gr
 from loguru import logger
 from acestep.gradio_ui.i18n import t
+from acestep.inference import generate_music, GenerationParams, GenerationConfig
+from acestep.audio_utils import save_audio
+
+
+def _build_generation_info(
+    lm_metadata: Optional[Dict[str, Any]],
+    time_costs: Dict[str, float],
+    seed_value: str,
+    inference_steps: int,
+    num_audios: int,
+) -> str:
+    """Build generation info string from result data.
+
+    Args:
+        lm_metadata: LM-generated metadata dictionary
+        time_costs: Unified time costs dictionary
+        seed_value: Seed value string
+        inference_steps: Number of inference steps
+        num_audios: Number of generated audios
+
+    Returns:
+        Formatted generation info string
+    """
+    info_parts = []
+
+    # Part 1: LM-generated metadata (if available)
+    if lm_metadata:
+        metadata_lines = []
+        if lm_metadata.get('bpm'):
+            metadata_lines.append(f"- **BPM:** {lm_metadata['bpm']}")
+        if lm_metadata.get('caption'):
+            metadata_lines.append(f"- **Refined Caption:** {lm_metadata['caption']}")
+        if lm_metadata.get('lyrics'):
+            metadata_lines.append(f"- **Refined Lyrics:** {lm_metadata['lyrics']}")
+        if lm_metadata.get('duration'):
+            metadata_lines.append(f"- **Duration:** {lm_metadata['duration']} seconds")
+        if lm_metadata.get('keyscale'):
+            metadata_lines.append(f"- **Key Scale:** {lm_metadata['keyscale']}")
+        if lm_metadata.get('language'):
+            metadata_lines.append(f"- **Language:** {lm_metadata['language']}")
+        if lm_metadata.get('timesignature'):
+            metadata_lines.append(f"- **Time Signature:** {lm_metadata['timesignature']}")
+
+        if metadata_lines:
+            metadata_section = "**šŸ¤– LM-Generated Metadata:**\n" + "\n".join(metadata_lines)
+            info_parts.append(metadata_section)
+
+    # Part 2: Time costs (formatted and beautified)
+    if time_costs:
+        time_lines = []
+
+        # LM time costs
+        lm_phase1 = time_costs.get('lm_phase1_time', 0.0)
+        lm_phase2 = time_costs.get('lm_phase2_time', 0.0)
+        lm_total = time_costs.get('lm_total_time', 0.0)
+
+        if lm_total > 0:
+            time_lines.append("**🧠 LM Time:**")
+            if lm_phase1 > 0:
+                time_lines.append(f" - Phase 1 (CoT): {lm_phase1:.2f}s")
+            if lm_phase2 > 0:
+                time_lines.append(f" - Phase 2 (Codes): {lm_phase2:.2f}s")
+            time_lines.append(f" - Total: {lm_total:.2f}s")
+
+        # DiT time costs
+        dit_encoder = time_costs.get('dit_encoder_time_cost', 0.0)
+        dit_model = time_costs.get('dit_model_time_cost', 0.0)
+        dit_vae_decode = time_costs.get('dit_vae_decode_time_cost', 0.0)
+        dit_offload = time_costs.get('dit_offload_time_cost', 0.0)
+        dit_total = time_costs.get('dit_total_time_cost', 0.0)
+        if dit_total > 0:
+            time_lines.append("\n**šŸŽµ DiT Time:**")
+            if dit_encoder > 0:
+                time_lines.append(f" - Encoder: {dit_encoder:.2f}s")
+            if dit_model > 0:
+                time_lines.append(f" - Model: {dit_model:.2f}s")
+            if dit_vae_decode > 0:
+                time_lines.append(f" - VAE Decode: {dit_vae_decode:.2f}s")
+            if dit_offload > 0:
+                time_lines.append(f" - Offload: {dit_offload:.2f}s")
+            time_lines.append(f" - Total: {dit_total:.2f}s")
+
+        # Post-processing time costs
+        audio_conversion_time = time_costs.get('audio_conversion_time', 0.0)
+        auto_score_time = time_costs.get('auto_score_time', 0.0)
+
+        if audio_conversion_time > 0 or auto_score_time > 0:
+            time_lines.append("\n**šŸ”§ Post-processing Time:**")
+            if audio_conversion_time > 0:
+                time_lines.append(f" - Audio Conversion: {audio_conversion_time:.2f}s")
+            if auto_score_time > 0:
+                time_lines.append(f" - Auto Score: {auto_score_time:.2f}s")
+
+        # Pipeline total
+        pipeline_total = time_costs.get('pipeline_total_time', 0.0)
+        if pipeline_total > 0:
+            time_lines.append(f"\n**ā±ļø Pipeline Total: {pipeline_total:.2f}s**")
+
+        if time_lines:
+            time_section = "\n".join(time_lines)
+            info_parts.append(time_section)
+
+    # Part 3: Generation summary
+    summary_lines = [
+        "**šŸŽµ Generation Complete**",
+        f" - **Seeds:** {seed_value}",
+        f" - **Steps:** {inference_steps}",
+        f" - **Audio Count:** {num_audios} audio(s)",
+    ]
+    info_parts.append("\n".join(summary_lines))
+
+    # Combine all parts
+    return "\n\n".join(info_parts)
 
 
 def store_batch_in_queue(
@@ -254,383 +368,205 @@ def generate_with_progress(
     auto_score,
     score_scale,
     lm_batch_chunk_size,
-    progress=gr.Progress(track_tqdm=True)
+    progress=gr.Progress(track_tqdm=True),
 ):
     """Generate audio with progress tracking"""
-    # If think is enabled (llm_dit mode) and use_cot_metas is True, generate audio codes using LM first
-    audio_code_string_to_use = text2music_audio_code_string
-    lm_generated_metadata = None  # Store LM-generated metadata for display
-    lm_generated_audio_codes = None  # Store LM-generated audio codes for display
-    lm_generated_audio_codes_list = []  # Store list of audio codes for batch processing
-
-    # Determine if we should use batch LM generation
-    should_use_lm_batch = (
-        think_checkbox and
-        llm_handler.llm_initialized and
-        use_cot_metas and
-        allow_lm_batch and
-        batch_size_input >= 2
+
+    # step 1: prepare inputs
+    # generate_music, GenerationParams, GenerationConfig
+    gen_params = GenerationParams(
+        task_type=task_type,
+        instruction=instruction_display_gen,
+        reference_audio=reference_audio,
+        src_audio=src_audio,
+        audio_codes=text2music_audio_code_string if not think_checkbox else "",
+        caption=captions or "",
+        lyrics=lyrics or "",
+        instrumental=False,
+        vocal_language=vocal_language,
+        bpm=bpm,
+        keyscale=key_scale,
+        timesignature=time_signature,
+        duration=audio_duration,
+        inference_steps=inference_steps,
+        guidance_scale=guidance_scale,
+        use_adg=use_adg,
+        cfg_interval_start=cfg_interval_start,
+        cfg_interval_end=cfg_interval_end,
+        repainting_start=repainting_start,
+        repainting_end=repainting_end,
+        audio_cover_strength=audio_cover_strength,
+        thinking=think_checkbox,
+        lm_temperature=lm_temperature,
+        lm_cfg_scale=lm_cfg_scale,
+        lm_top_k=lm_top_k,
+        lm_top_p=lm_top_p,
+        lm_negative_prompt=lm_negative_prompt,
+        use_cot_metas=use_cot_metas,
+        use_cot_caption=use_cot_caption,
+        use_cot_language=use_cot_language,
+        use_constrained_decoding=True,
+    )
+    # seed string to list
+    if isinstance(seed, str) and seed.strip():
+        if "," in seed:
+            seed_list = [int(s.strip()) for s in seed.split(",")]
+        else:
+            seed_list = [int(seed.strip())]
+    else:
+        seed_list = None
+    gen_config = GenerationConfig(
+        batch_size=batch_size_input,
+        allow_lm_batch=allow_lm_batch,
+        use_random_seed=random_seed_checkbox,
+        seeds=seed_list,
+        lm_batch_chunk_size=lm_batch_chunk_size,
+        constrained_decoding_debug=constrained_decoding_debug,
+        audio_format=audio_format,
+    )
+    result = generate_music(
+        dit_handler,
+        llm_handler,
+        params=gen_params,
+        config=gen_config,
+        progress=progress,
     )
 
-    if think_checkbox and llm_handler.llm_initialized and use_cot_metas:
-        # Convert top_k: 0 means None (disabled)
-        top_k_value = None if lm_top_k == 0 else int(lm_top_k)
-        # Convert top_p: 1.0 means None (disabled)
-        top_p_value = None if lm_top_p >= 1.0 else lm_top_p
-
-        # Build user_metadata from user-provided values (only include non-empty values)
-        user_metadata = {}
-        # Handle bpm: gr.Number can be None, int, float, or string
-        if bpm is not None:
-            try:
-                bpm_value = float(bpm)
-                if bpm_value > 0:
-                    user_metadata['bpm'] = str(int(bpm_value))
-            except (ValueError, TypeError):
-                # If bpm is not a valid number, skip it
-                pass
-        if key_scale and key_scale.strip():
-            key_scale_clean = key_scale.strip()
-            if key_scale_clean.lower() not in ["n/a", ""]:
-                user_metadata['keyscale'] = key_scale_clean
-        if time_signature and time_signature.strip():
-            time_sig_clean = time_signature.strip()
-            if time_sig_clean.lower() not in ["n/a", ""]:
-                user_metadata['timesignature'] = time_sig_clean
-        if audio_duration is not None:
-            try:
-                duration_value = float(audio_duration)
-                if duration_value > 0:
-                    user_metadata['duration'] = str(int(duration_value))
-            except (ValueError, TypeError):
-                # If audio_duration is not a valid number, skip it
-                pass
-
-        # Only pass user_metadata if user provided any values, otherwise let LM generate
-        user_metadata_to_pass = user_metadata if user_metadata else None
-
-        if should_use_lm_batch:
-            # BATCH LM GENERATION
-            logger.info(f"Using LM batch generation for {batch_size_input} items...")
-
-            # Prepare seeds for batch items
-            actual_seed_list, _ = dit_handler.prepare_seeds(batch_size_input, seed, random_seed_checkbox)
-
-            # Split batch into chunks (GPU memory constraint)
-            max_inference_batch_size = int(lm_batch_chunk_size)
-            num_chunks = math.ceil(batch_size_input / max_inference_batch_size)
-
-            all_metadata_list = []
-            all_audio_codes_list = []
-
-            for chunk_idx in range(num_chunks):
-                chunk_start = chunk_idx * max_inference_batch_size
-                chunk_end = min(chunk_start + max_inference_batch_size, batch_size_input)
-                chunk_size = chunk_end - chunk_start
-                chunk_seeds = actual_seed_list[chunk_start:chunk_end]
-
-                logger.info(f"Generating LM batch chunk {chunk_idx+1}/{num_chunks} (size: {chunk_size}, seeds: {chunk_seeds})...")
-
-                # Generate batch
-                metadata_list, audio_codes_list, status = llm_handler.generate_with_stop_condition(
-                    caption=captions or "",
-                    lyrics=lyrics or "",
-                    infer_type="llm_dit",
-                    temperature=lm_temperature,
-                    cfg_scale=lm_cfg_scale,
-                    negative_prompt=lm_negative_prompt,
-                    top_k=top_k_value,
-                    top_p=top_p_value,
-                    user_metadata=user_metadata_to_pass,
-                    use_cot_caption=use_cot_caption,
-                    use_cot_language=use_cot_language,
-                    is_format_caption=is_format_caption,
-                    constrained_decoding_debug=constrained_decoding_debug,
-                    batch_size=chunk_size,
-                    seeds=chunk_seeds,
-                )
-
-                all_metadata_list.extend(metadata_list)
-                all_audio_codes_list.extend(audio_codes_list)
-
-            # Use first metadata as representative (all are same)
-            lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
-
-            # Store audio codes list for later use
-            lm_generated_audio_codes_list = all_audio_codes_list
+    audio_outputs = [None] * 8
+    all_audio_paths = []
+    final_codes_list = [""] * 8
+    final_scores_list = [""] * 8
+
+    # Build generation_info from result data
+    status_message = result.status_message
+    seed_value_for_ui = result.extra_outputs.get("seed_value", "")
+    lm_generated_metadata = result.extra_outputs.get("lm_metadata", {})
+    time_costs = result.extra_outputs.get("time_costs", {}).copy()
+
+    # Initialize post-processing timing
+    audio_conversion_start_time = time_module.time()
+    total_auto_score_time = 0.0
+
+    align_score_1 = ""
+    align_text_1 = ""
+    align_plot_1 = None
+    align_score_2 = ""
+    align_text_2 = ""
+    align_plot_2 = None
+    updated_audio_codes = text2music_audio_code_string if not think_checkbox else ""
+
+    if not result.success:
+        # Build generation_info string for error case
+        generation_info = _build_generation_info(
+            lm_metadata=lm_generated_metadata,
+            time_costs=time_costs,
+            seed_value=seed_value_for_ui,
+            inference_steps=inference_steps,
+            num_audios=0,
+        )
+        yield (None,) * 8 + (None, generation_info, result.status_message) + (gr.skip(),) * 25
+        return
+
+    audios = result.audios
+    progress(0.99, "Converting audio to mp3...")
+    for i in range(8):
+        if i < len(audios):
+            key = audios[i]["key"]
+            audio_tensor = audios[i]["tensor"]
+            sample_rate = audios[i]["sample_rate"]
+            audio_params = audios[i]["params"]
+            temp_dir = tempfile.mkdtemp(f"acestep_gradio_results/")
+            os.makedirs(temp_dir, exist_ok=True)
+            json_path = os.path.join(temp_dir, f"{key}.json")
+            audio_path = os.path.join(temp_dir, f"{key}.{audio_format}")
+            save_audio(audio_data=audio_tensor, output_path=audio_path, sample_rate=sample_rate, format=audio_format, channels_first=True)
+            audio_outputs[i] = audio_path
+            all_audio_paths.append(audio_path)
 
-            # Prepare audio codes for DiT (list of codes, one per batch item)
-            audio_code_string_to_use = all_audio_codes_list
+            code_str = audio_params.get("audio_codes", "")
+            final_codes_list[i] = code_str
 
-            # Update metadata fields from LM if not provided by user
-            if lm_generated_metadata:
-                if bpm is None and lm_generated_metadata.get('bpm'):
-                    bpm_value = lm_generated_metadata.get('bpm')
-                    if bpm_value != "N/A" and bpm_value != "":
-                        try:
-                            bpm = int(bpm_value)
-                        except:
-                            pass
-                if not key_scale and lm_generated_metadata.get('keyscale'):
-                    key_scale_value = lm_generated_metadata.get('keyscale', lm_generated_metadata.get('key_scale', ""))
-                    if key_scale_value != "N/A":
-                        key_scale = key_scale_value
-                if not time_signature and lm_generated_metadata.get('timesignature'):
-                    time_signature_value = lm_generated_metadata.get('timesignature', lm_generated_metadata.get('time_signature', ""))
-                    if time_signature_value != "N/A":
-                        time_signature = time_signature_value
-                if audio_duration is None or audio_duration <= 0:
-                    audio_duration_value = lm_generated_metadata.get('duration', -1)
-                    if audio_duration_value != "N/A" and audio_duration_value != "":
-                        try:
-                            audio_duration = float(audio_duration_value)
-                        except:
-                            pass
-        else:
-            # SEQUENTIAL LM GENERATION (current behavior, when allow_lm_batch is False)
-            # Phase 1: Generate CoT metadata
-            phase1_start = time_module.time()
-            metadata, _, status = llm_handler.generate_with_stop_condition(
-                caption=captions or "",
-                lyrics=lyrics or "",
-                infer_type="dit",  # Only generate metadata in Phase 1
-                temperature=lm_temperature,
-                cfg_scale=lm_cfg_scale,
-                negative_prompt=lm_negative_prompt,
-                top_k=top_k_value,
-                top_p=top_p_value,
-                user_metadata=user_metadata_to_pass,
-                use_cot_caption=use_cot_caption,
-                use_cot_language=use_cot_language,
-                is_format_caption=is_format_caption,
-                constrained_decoding_debug=constrained_decoding_debug,
-            )
-            lm_phase1_time = time_module.time() - phase1_start
-            logger.info(f"LM Phase 1 (CoT) completed in {lm_phase1_time:.2f}s")
+            scores_ui_updates = [gr.skip()] * 8
+            score_str = "Done!"
+            if auto_score:
+                auto_score_start = time_module.time()
+                score_str = calculate_score_handler(llm_handler, code_str, captions, lyrics, lm_generated_metadata, bpm, key_scale, time_signature, audio_duration, vocal_language, score_scale)
+                auto_score_end = time_module.time()
+                total_auto_score_time += (auto_score_end - auto_score_start)
+            scores_ui_updates[i] = score_str
+            final_scores_list[i] = score_str
 
-            # Phase 2: Generate audio codes
-            phase2_start = time_module.time()
-            metadata, audio_codes, status = llm_handler.generate_with_stop_condition(
-                caption=captions or "",
-                lyrics=lyrics or "",
-                infer_type="llm_dit",  # Generate both metadata and codes
-                temperature=lm_temperature,
-                cfg_scale=lm_cfg_scale,
-                negative_prompt=lm_negative_prompt,
-                top_k=top_k_value,
-                top_p=top_p_value,
-                user_metadata=user_metadata_to_pass,
-                use_cot_caption=use_cot_caption,
-                use_cot_language=use_cot_language,
-                is_format_caption=is_format_caption,
-                constrained_decoding_debug=constrained_decoding_debug,
+            status_message = f"Encoding & Ready: {i+1}/{len(audios)}"
+            current_audio_updates = [gr.skip()] * 8
+            current_audio_updates[i] = audio_path
+
+            audio_codes_ui_updates = [gr.skip()] * 8
+            audio_codes_ui_updates[i] = code_str
+            yield (
+                current_audio_updates[0], current_audio_updates[1], current_audio_updates[2], current_audio_updates[3],
+                current_audio_updates[4], current_audio_updates[5], current_audio_updates[6], current_audio_updates[7],
+                all_audio_paths,  # Real-time update of Batch File list
+                generation_info,
+                status_message,
+                seed_value_for_ui,
+                # Align plot placeholders (assume no need to update in real time)
+                gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
+                # Scores
+                scores_ui_updates[0], scores_ui_updates[1], scores_ui_updates[2], scores_ui_updates[3], scores_ui_updates[4], scores_ui_updates[5], scores_ui_updates[6], scores_ui_updates[7],
+                updated_audio_codes,
+                # Codes
+                audio_codes_ui_updates[0], audio_codes_ui_updates[1], audio_codes_ui_updates[2], audio_codes_ui_updates[3],
+                audio_codes_ui_updates[4], audio_codes_ui_updates[5], audio_codes_ui_updates[6], audio_codes_ui_updates[7],
+                lm_generated_metadata,
+                is_format_caption,
             )
-            lm_phase2_time = time_module.time() - phase2_start
-            logger.info(f"LM Phase 2 (Codes) completed in {lm_phase2_time:.2f}s")
-
-            # Store LM-generated metadata and audio codes for display
-            lm_generated_metadata = metadata
-            if audio_codes:
-                audio_code_string_to_use = audio_codes
-                lm_generated_audio_codes = audio_codes
-            # Update metadata fields only if they are empty/None (user didn't provide them)
-            if bpm is None and metadata.get('bpm'):
-                bpm_value = metadata.get('bpm')
-                if bpm_value != "N/A" and bpm_value != "":
-                    try:
-                        bpm = int(bpm_value)
-                    except:
-                        pass
-            if not key_scale and metadata.get('keyscale'):
-                key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
-                if key_scale_value != "N/A":
-                    key_scale = key_scale_value
-            if not time_signature and metadata.get('timesignature'):
-                time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
-                if time_signature_value != "N/A":
-                    time_signature = time_signature_value
-            if audio_duration is None or audio_duration <= 0:
-                audio_duration_value = metadata.get('duration', -1)
-                if audio_duration_value != "N/A" and audio_duration_value != "":
-                    try:
-                        audio_duration = float(audio_duration_value)
-                    except:
-                        pass
-
-    # Call generate_music and get results
-    result = dit_handler.generate_music(
-        captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
-        time_signature=time_signature, vocal_language=vocal_language,
-        inference_steps=inference_steps, guidance_scale=guidance_scale,
-        use_random_seed=random_seed_checkbox, seed=seed,
-        reference_audio=reference_audio, audio_duration=audio_duration,
-        batch_size=batch_size_input, src_audio=src_audio,
-        audio_code_string=audio_code_string_to_use,
-        repainting_start=repainting_start, repainting_end=repainting_end,
-        instruction=instruction_display_gen, audio_cover_strength=audio_cover_strength,
-        task_type=task_type, use_adg=use_adg,
-        cfg_interval_start=cfg_interval_start, cfg_interval_end=cfg_interval_end,
-        audio_format=audio_format, lm_temperature=lm_temperature,
-        progress=progress
-    )
-
-    # Extract results from new dict structure
-    if not isinstance(result, dict):
-        # Fallback for old tuple format (should not happen)
-        first_audio, second_audio, all_audio_paths, generation_info, status_message, seed_value_for_ui, \
-            align_score_1, align_text_1, align_plot_1, align_score_2, align_text_2, align_plot_2 = result
-    else:
-        audios = result.get("audios", [])
-        all_audio_paths = [audio.get("path") for audio in audios]
-        first_audio = all_audio_paths[0] if len(all_audio_paths) > 0 else None
-        second_audio = all_audio_paths[1] if len(all_audio_paths) > 1 else None
-        generation_info = result.get("generation_info", "")
-        status_message = result.get("status_message", "")
-        seed_value_for_ui = result.get("extra_outputs", {}).get("seed_value", "")
-        # Legacy alignment fields (no longer used)
-        align_score_1 = ""
-        align_text_1 = ""
-        align_plot_1 = None
-        align_score_2 = ""
-        align_text_2 = ""
-        align_plot_2 = None
-
-    # Extract LM timing from status if available and prepend to generation_info
-    if status:
-        import re
-        # Try to extract timing info from status using regex
-        # Expected format: "Phase1: X.XXs" and "Phase2: X.XXs"
-        phase1_match = re.search(r'Phase1:\s*([\d.]+)s', status)
-        phase2_match = re.search(r'Phase2:\s*([\d.]+)s', status)
-
-        if phase1_match or phase2_match:
-            lm_timing_section = "\n\n**šŸ¤– LM Timing:**\n"
-            lm_total = 0.0
-            if phase1_match:
-                phase1_time = float(phase1_match.group(1))
-                lm_timing_section += f" - Phase 1 (CoT Metadata): {phase1_time:.2f}s\n"
-                lm_total += phase1_time
-            if phase2_match:
-                phase2_time = float(phase2_match.group(1))
-                lm_timing_section += f" - Phase 2 (Audio Codes): {phase2_time:.2f}s\n"
-                lm_total += phase2_time
-            if lm_total > 0:
-                lm_timing_section += f" - Total LM Time: {lm_total:.2f}s\n"
-            generation_info = lm_timing_section + "\n" + generation_info
-
-    # Append LM-generated metadata to generation_info if available
-    if lm_generated_metadata:
-        metadata_lines = []
-        if lm_generated_metadata.get('bpm'):
-            metadata_lines.append(f"- **BPM:** {lm_generated_metadata['bpm']}")
-        if lm_generated_metadata.get('caption'):
-            metadata_lines.append(f"- **User Query Rewritten Caption:** {lm_generated_metadata['caption']}")
-        if lm_generated_metadata.get('duration'):
-            metadata_lines.append(f"- **Duration:** {lm_generated_metadata['duration']} seconds")
-        if lm_generated_metadata.get('keyscale'):
-            metadata_lines.append(f"- **KeyScale:** {lm_generated_metadata['keyscale']}")
-        if lm_generated_metadata.get('language'):
-            metadata_lines.append(f"- **Language:** {lm_generated_metadata['language']}")
-        if lm_generated_metadata.get('timesignature'):
-            metadata_lines.append(f"- **Time Signature:** {lm_generated_metadata['timesignature']}")
-
-        if metadata_lines:
-            metadata_section = "\n\n**šŸ¤– LM-Generated Metadata:**\n" + "\n\n".join(metadata_lines)
-            generation_info = metadata_section + "\n\n" + generation_info
-
-    # Update audio codes in UI if LM generated them
-    codes_outputs = [""] * 8  # Codes for 8 components
-    if should_use_lm_batch and lm_generated_audio_codes_list:
-        # Batch mode: update individual codes inputs
-        for idx in range(min(len(lm_generated_audio_codes_list), 8)):
-            codes_outputs[idx] = lm_generated_audio_codes_list[idx]
-        # For single codes input, show first one
-        updated_audio_codes = lm_generated_audio_codes_list[0] if lm_generated_audio_codes_list else text2music_audio_code_string
-    else:
-        # Single mode: update main codes input
-        updated_audio_codes = lm_generated_audio_codes if lm_generated_audio_codes else text2music_audio_code_string
-
-    # AUTO-SCORING
-    score_displays = [""] * 8  # Scores for 8 components
-    if auto_score and all_audio_paths:
-        logger.info(f"Auto-scoring enabled, calculating quality scores for {batch_size_input} generated audios...")
-
-        # Determine which audio codes to use for scoring
-        if should_use_lm_batch and lm_generated_audio_codes_list:
-            codes_list = lm_generated_audio_codes_list
-        elif audio_code_string_to_use and isinstance(audio_code_string_to_use, list):
-            codes_list = audio_code_string_to_use
         else:
-            # Single code string, replicate for all audios
-            codes_list = [audio_code_string_to_use] * len(all_audio_paths)
-
-        # Calculate scores only for actually generated audios (up to batch_size_input)
-        # Don't score beyond the actual batch size to avoid duplicates
-        actual_audios_to_score = min(len(all_audio_paths), int(batch_size_input))
-        for idx in range(actual_audios_to_score):
-            if idx < len(codes_list) and codes_list[idx]:
-                try:
-                    score_display = calculate_score_handler(
-                        llm_handler,
-                        codes_list[idx],
-                        captions,
-                        lyrics,
-                        lm_generated_metadata,
-                        bpm, key_scale, time_signature, audio_duration, vocal_language,
-                        score_scale
-                    )
-                    score_displays[idx] = score_display
-                    logger.info(f"Auto-scored audio {idx+1}")
-                except Exception as e:
-                    logger.error(f"Auto-scoring failed for audio {idx+1}: {e}")
-                    score_displays[idx] = f"āŒ Auto-scoring failed: {str(e)}"
-
-    # Prepare audio outputs (up to 8)
-    audio_outputs = [None] * 8
-    for idx in range(min(len(all_audio_paths), 8)):
-        audio_outputs[idx] = all_audio_paths[idx]
+            # If i exceeds the generated count (e.g., batch=2, i=2..7), do not yield
+            pass
+        time_module.sleep(0.1)
+
+    # Record audio conversion time
+    audio_conversion_end_time = time_module.time()
+    audio_conversion_time = audio_conversion_end_time - audio_conversion_start_time
+
+    # Add post-processing times to time_costs
+    if audio_conversion_time > 0:
+        time_costs['audio_conversion_time'] = audio_conversion_time
+    if total_auto_score_time > 0:
+        time_costs['auto_score_time'] = total_auto_score_time
+
+    # Update pipeline total time to include post-processing
+    if 'pipeline_total_time' in time_costs:
+        time_costs['pipeline_total_time'] += audio_conversion_time + total_auto_score_time
+
+    # Rebuild generation_info with complete timing information
+    generation_info = _build_generation_info(
+        lm_metadata=lm_generated_metadata,
+        time_costs=time_costs,
+        seed_value=seed_value_for_ui,
+        inference_steps=inference_steps,
+        num_audios=len(result.audios),
+    )
 
-    return (
-        audio_outputs[0],  # generated_audio_1
-        audio_outputs[1],  # generated_audio_2
-        audio_outputs[2],  # generated_audio_3
-        audio_outputs[3],  # generated_audio_4
-        audio_outputs[4],  # generated_audio_5
-        audio_outputs[5],  # generated_audio_6
-        audio_outputs[6],  # generated_audio_7
-        audio_outputs[7],  # generated_audio_8
-        all_audio_paths,  # generated_audio_batch
+    yield (
+        gr.skip(), gr.skip(), gr.skip(), gr.skip(),  # Audio 1-4: SKIP
+        gr.skip(), gr.skip(), gr.skip(), gr.skip(),  # Audio 5-8: SKIP
+        all_audio_paths,
         generation_info,
-        status_message,
+        "Generation Complete",
         seed_value_for_ui,
-        align_score_1,
-        align_text_1,
-        align_plot_1,
-        align_score_2,
-        align_text_2,
-        align_plot_2,
-        score_displays[0],  # score_display_1
-        score_displays[1],  # score_display_2
-        score_displays[2],  # score_display_3
-        score_displays[3],  # score_display_4
-        score_displays[4],  # score_display_5
-        score_displays[5],  # score_display_6
-        score_displays[6],  # score_display_7
-        score_displays[7],  # score_display_8
-        updated_audio_codes,  # Update main audio codes in UI
-        codes_outputs[0],  # text2music_audio_code_string_1
-        codes_outputs[1],  # text2music_audio_code_string_2
-        codes_outputs[2],  # text2music_audio_code_string_3
-        codes_outputs[3],  # text2music_audio_code_string_4
-        codes_outputs[4],  # text2music_audio_code_string_5
-        codes_outputs[5],  # text2music_audio_code_string_6
-        codes_outputs[6],  # text2music_audio_code_string_7
-        codes_outputs[7],  # text2music_audio_code_string_8
-        lm_generated_metadata,  # Store metadata for "Send to src audio" buttons
-        is_format_caption,  # Keep is_format_caption unchanged
+        align_score_1, align_text_1, align_plot_1, align_score_2, align_text_2, align_plot_2,
+        final_scores_list[0], final_scores_list[1], final_scores_list[2], final_scores_list[3],
+        final_scores_list[4], final_scores_list[5], final_scores_list[6], final_scores_list[7],
+        updated_audio_codes,
+        final_codes_list[0], final_codes_list[1], final_codes_list[2], final_codes_list[3],
+        final_codes_list[4], final_codes_list[5], final_codes_list[6], final_codes_list[7],
+        lm_generated_metadata,
+        is_format_caption,
     )
 
 
+
 def calculate_score_handler(llm_handler, audio_codes_str, caption, lyrics, lm_metadata, bpm, key_scale, time_signature, audio_duration, vocal_language, score_scale):
     """
     Calculate PMI-based quality score for generated audio.
@@ -773,7 +709,9 @@ def calculate_score_handler_with_selection(llm_handler, sample_idx, score_scale,
     if stored_allow_lm_batch and isinstance(stored_codes, list):
         # Batch mode: use specific sample's codes
         if 0 <= sample_idx - 1 < len(stored_codes):
-            audio_codes_str = stored_codes[sample_idx - 1]
+            code_item = stored_codes[sample_idx - 1]
+            # Ensure it's a string (handle cases where dict was mistakenly stored)
+            audio_codes_str = code_item if isinstance(code_item, str) else ""
    else:
        # Single mode: all samples use same codes
        audio_codes_str = stored_codes if isinstance(stored_codes, str) else ""
@@ -885,7 +823,7 @@ def generate_with_batch_management(
     Wrapper for generate_with_progress that adds batch queue management
     """
     # Call the original generation function
-    result = generate_with_progress(
+    generator = generate_with_progress(
        dit_handler, llm_handler,
        captions, lyrics, bpm, key_scale, time_signature, vocal_language,
        inference_steps, guidance_scale, random_seed_checkbox, seed,
@@ -902,23 +840,41 @@
         lm_batch_chunk_size,
         progress
     )
-
-    # Extract results from generation
-    all_audio_paths = result[8]  # generated_audio_batch
+    final_result_from_inner = None
+    for partial_result in generator:
+        final_result_from_inner = partial_result
+        # current_batch_index, total_batches, batch_queue, next_params,
+        # batch_indicator_text, prev_btn, next_btn, next_status, restore_btn
+        yield partial_result + (
+            gr.skip(), gr.skip(), gr.skip(), gr.skip(),
+            gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
+        )
+    result = final_result_from_inner
+    all_audio_paths = result[8]
+
+    if all_audio_paths is None:
+
+        yield result + (
+            gr.skip(), gr.skip(), gr.skip(), gr.skip(),
+            gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
+        )
+        return
+
+    # Extract results from generation (access result by index)
     generation_info = result[9]
     seed_value_for_ui = result[11]
-    lm_generated_metadata = result[34]  # Index 34 is lm_metadata_state
+    lm_generated_metadata = result[35]  # Fixed: lm_metadata is at index 35, not 34
 
     # Extract codes
     generated_codes_single = result[26]
     generated_codes_batch = [result[27], result[28], result[29], result[30], result[31], result[32], result[33], result[34]]
-
+
     # Determine which codes to store based on mode
     if allow_lm_batch and batch_size_input >= 2:
        codes_to_store = generated_codes_batch[:int(batch_size_input)]
     else:
        codes_to_store = generated_codes_single
-
+
     # Save parameters for history
     saved_params = {
        "captions": captions,
@@ -964,6 +920,7 @@
     }
 
     # Next batch parameters (with cleared codes & random seed)
+    # Next batch parameters
     next_params = saved_params.copy()
     next_params["text2music_audio_code_string"] = ""
     next_params["random_seed_checkbox"] = True
@@ -996,9 +953,10 @@
     next_batch_status_text = ""
     if autogen_checkbox:
        next_batch_status_text = t("messages.autogen_enabled")
-
-    # Return original results plus batch management state updates
-    return result + (
+
+    # 4. Yield final result (includes Batch UI updates)
+    # The result here is already a tuple structure
+    yield result + (
        current_batch_index,
        total_batches,
        batch_queue,
@@ -1114,7 +1072,8 @@ def generate_next_batch_background(
     params.setdefault("complete_track_classes", [])
 
     # Call generate_with_progress with the saved parameters
-    result = generate_with_progress(
+    # Note: generate_with_progress is a generator, need to iterate through it
+    generator = generate_with_progress(
        dit_handler,
        llm_handler,
        captions=params.get("captions"),
@@ -1159,15 +1118,20 @@
         progress=progress
     )
 
-    # Extract results
-    all_audio_paths = result[8]  # generated_audio_batch
-    generation_info = result[9]
-    seed_value_for_ui = result[11]
-    lm_generated_metadata = result[34]  # Index 34 is lm_metadata_state
+    # Consume generator to get final result (similar to generate_with_batch_management)
+    final_result = None
+    for partial_result in generator:
+        final_result = partial_result
+
+    # Extract results from final_result
+    all_audio_paths = final_result[8]  # generated_audio_batch
+    generation_info = final_result[9]
+    seed_value_for_ui = final_result[11]
+    lm_generated_metadata = final_result[35]  # Fixed: lm_metadata is at index 35, not 34
 
     # Extract codes
-    generated_codes_single = result[26]
-    generated_codes_batch = [result[27], result[28], result[29], result[30], result[31], result[32], result[33], result[34]]
+    generated_codes_single = final_result[26]
+    generated_codes_batch = [final_result[27], final_result[28], final_result[29], final_result[30], final_result[31], final_result[32], final_result[33], final_result[34]]
 
     # Determine which codes to store
     batch_size = params.get("batch_size_input", 2)
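Both wrappers above follow the same streaming pattern: iterate the inner generator, re-yield each partial tuple padded with gr.skip() so unrelated output components stay untouched, and append the real batch-management values only on the final yield. A reduced sketch of that pattern (toy payloads; assumes a Gradio version that provides gr.skip()):

    import gradio as gr

    def inner():
        # stands in for generate_with_progress
        yield ("audio-1", "encoding 1/2")
        yield ("audio-2", "encoding 2/2")

    def outer():
        # stands in for generate_with_batch_management
        final = None
        for partial in inner():
            final = partial
            # pad the extra batch-management slot so it is left untouched
            yield partial + (gr.skip(),)
        # the final yield fills that slot with real state
        yield final + ("batch-queue-state",)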
acestep/gradio_ui/interfaces/result.py CHANGED
@@ -28,7 +28,8 @@ def create_results_section(dit_handler) -> dict:
         generated_audio_1 = gr.Audio(
             label=t("results.generated_music", n=1),
             type="filepath",
-            interactive=False
+            interactive=False,
+            show_download_button=False
         )
         with gr.Row(equal_height=True):
             send_to_src_btn_1 = gr.Button(
@@ -58,7 +59,8 @@ def create_results_section(dit_handler) -> dict:
         generated_audio_2 = gr.Audio(
             label=t("results.generated_music", n=2),
             type="filepath",
-            interactive=False
+            interactive=False,
+            show_download_button=False
         )
         with gr.Row(equal_height=True):
             send_to_src_btn_2 = gr.Button(
@@ -88,7 +90,8 @@ def create_results_section(dit_handler) -> dict:
         generated_audio_3 = gr.Audio(
             label=t("results.generated_music", n=3),
             type="filepath",
-            interactive=False
+            interactive=False,
+            show_download_button=False
         )
         with gr.Row(equal_height=True):
             send_to_src_btn_3 = gr.Button(
@@ -118,7 +121,8 @@ def create_results_section(dit_handler) -> dict:
         generated_audio_4 = gr.Audio(
             label=t("results.generated_music", n=4),
             type="filepath",
-            interactive=False
+            interactive=False,
+            show_download_button=False
         )
         with gr.Row(equal_height=True):
             send_to_src_btn_4 = gr.Button(
@@ -151,7 +155,8 @@ def create_results_section(dit_handler) -> dict:
         generated_audio_5 = gr.Audio(
             label=t("results.generated_music", n=5),
             type="filepath",
-            interactive=False
+            interactive=False,
+            show_download_button=False
         )
         with gr.Row(equal_height=True):
             send_to_src_btn_5 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -166,7 +171,8 @@ def create_results_section(dit_handler) -> dict:
         generated_audio_6 = gr.Audio(
             label=t("results.generated_music", n=6),
             type="filepath",
-            interactive=False
+            interactive=False,
+            show_download_button=False
         )
         with gr.Row(equal_height=True):
             send_to_src_btn_6 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -181,7 +187,8 @@ def create_results_section(dit_handler) -> dict:
         generated_audio_7 = gr.Audio(
             label=t("results.generated_music", n=7),
             type="filepath",
-            interactive=False
+            interactive=False,
+            show_download_button=False
         )
         with gr.Row(equal_height=True):
             send_to_src_btn_7 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -196,7 +203,8 @@ def create_results_section(dit_handler) -> dict:
         generated_audio_8 = gr.Audio(
             label=t("results.generated_music", n=8),
             type="filepath",
-            interactive=False
+            interactive=False,
+            show_download_button=False
         )
         with gr.Row(equal_height=True):
             send_to_src_btn_8 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
acestep/handler.py CHANGED
@@ -2077,7 +2077,6 @@ class AceStepHandler:
         if self.model is None or self.vae is None or self.text_tokenizer is None or self.text_encoder is None:
             return {
                 "audios": [],
-                "generation_info": "",
                 "status_message": "āŒ Model not fully initialized. Please initialize all components first.",
                 "extra_outputs": {},
                 "success": False,
@@ -2101,7 +2100,7 @@ class AceStepHandler:
 
         logger.info("[generate_music] Starting generation...")
         if progress:
-            progress(0.05, desc="Preparing inputs...")
+            progress(0.51, desc="Preparing inputs...")
         logger.info("[generate_music] Preparing inputs...")
 
         # Reset offload cost
@@ -2123,8 +2122,6 @@ class AceStepHandler:
             repainting_end = None
 
         try:
-            progress(0.1, desc="Preparing inputs...")
-
             # 1. Process reference audio
             refer_audios = None
             if reference_audio is not None:
@@ -2176,7 +2173,7 @@ class AceStepHandler:
                 can_use_repainting
             )
 
-            progress(0.3, desc=f"Generating music (batch size: {actual_batch_size})...")
+            progress(0.52, desc=f"Generating music (batch size: {actual_batch_size})...")
 
             # Prepare audio_code_hints - use if audio_code_string is provided
             # This works for both text2music (auto-switched to cover) and cover tasks
@@ -2245,7 +2242,7 @@ class AceStepHandler:
 
             logger.info("[generate_music] VAE decode completed. Preparing audio tensors...")
             if progress:
-                progress(0.9, desc="Preparing audio data...")
+                progress(0.99, desc="Preparing audio data...")
 
             # Prepare audio tensors (no file I/O here, no UUID generation)
             # pred_wavs is already [batch, channels, samples] format
@@ -2257,23 +2254,6 @@ class AceStepHandler:
                 audio_tensor = pred_wavs[i].cpu().float()
                 audio_tensors.append(audio_tensor)
 
-            # Format time costs if available
-            time_costs_str = ""
-            if time_costs:
-                if isinstance(time_costs, dict):
-                    time_costs_str = "\n\n**ā±ļø Time Costs:**\n"
-                    for key, value in time_costs.items():
-                        # Format key: encoder_time_cost -> Encoder
-                        formatted_key = key.replace("_time_cost", "").replace("_", " ").title()
-                        time_costs_str += f" - {formatted_key}: {value:.2f}s\n"
-                elif isinstance(time_costs, (int, float)):
-                    time_costs_str = f"\n\n**ā±ļø Time Cost:** {time_costs:.2f}s"
-
-            generation_info = f"""**šŸŽµ Generation Complete**
-
-**Seeds:** {seed_value_for_ui}
-**Steps:** {inference_steps}
-**Audio Count:** {len(audio_tensors)} audio(s){time_costs_str}"""
             status_message = f"āœ… Generation completed successfully!"
             logger.info(f"[generate_music] Done! Generated {len(audio_tensors)} audio tensors.")
@@ -2307,7 +2287,6 @@ class AceStepHandler:
 
         return {
             "audios": audios,
-            "generation_info": generation_info,
             "status_message": status_message,
             "extra_outputs": extra_outputs,
             "success": True,
@@ -2319,7 +2298,6 @@ class AceStepHandler:
         logger.exception("[generate_music] Generation failed")
         return {
             "audios": [],
-            "generation_info": "",
            "status_message": error_msg,
            "extra_outputs": {},
            "success": False,
acestep/inference.py CHANGED
@@ -67,19 +67,19 @@ class GenerationParams:
     # Required Inputs
     task_type: str = "text2music"
     instruction: str = "Fill the audio semantic mask based on the given conditions:"
-
+
     # Audio Uploads
     reference_audio: Optional[str] = None
     src_audio: Optional[str] = None
-
+
     # LM Codes Hints
     audio_codes: str = ""
-
+
     # Text Inputs
     caption: str = ""
     lyrics: str = ""
     instrumental: bool = False
-
+
     # Metadata
     vocal_language: str = "unknown"
     bpm: Optional[int] = None
@@ -98,7 +98,7 @@
     repainting_start: float = 0.0
     repainting_end: float = -1
     audio_cover_strength: float = 1.0
-
+
     # 5Hz Language Model Parameters
     thinking: bool = True
     lm_temperature: float = 0.85
@@ -108,8 +108,18 @@
     lm_negative_prompt: str = "NO USER INPUT"
     use_cot_metas: bool = True
     use_cot_caption: bool = True
+    use_cot_lyrics: bool = False  # TODO: not used yet
     use_cot_language: bool = True
-
+    use_constrained_decoding: bool = True
+
+    cot_bpm: Optional[int] = None
+    cot_keyscale: str = ""
+    cot_timesignature: str = ""
+    cot_duration: Optional[float] = None
+    cot_vocal_language: str = "unknown"
+    cot_caption: str = ""
+    cot_lyrics: str = ""
+
     def to_dict(self) -> Dict[str, Any]:
         """Convert config to dictionary for JSON serialization."""
         return asdict(self)
@@ -123,25 +133,27 @@ class GenerationConfig:
        batch_size: Number of audio samples to generate
        allow_lm_batch: Whether to allow batch processing in LM
        use_random_seed: Whether to use random seed
-       seed: Seed(s) for batch generation. Can be:
+       seeds: Seed(s) for batch generation. Can be:
           - None: Use random seeds (when use_random_seed=True) or params.seed (when use_random_seed=False)
          - List[int]: List of seeds, will be padded with random seeds if fewer than batch_size
          - int: Single seed value (will be converted to list and padded)
        lm_batch_chunk_size: Batch chunk size for LM processing
-       is_format_caption: Whether to format caption
        constrained_decoding_debug: Whether to enable constrained decoding debug
        audio_format: Output audio format, one of "mp3", "wav", "flac". Default: "flac"
    """
    batch_size: int = 2
    allow_lm_batch: bool = False
    use_random_seed: bool = True
-   seed: Optional[Union[int, List[int]]] = None
+   seeds: Optional[List[int]] = None
    lm_batch_chunk_size: int = 8
-   is_format_caption: bool = False
-   use_constrained_decoding: bool = True
    constrained_decoding_debug: bool = False
    audio_format: str = "flac"  # Default to FLAC for fast saving
 
+   def to_dict(self) -> Dict[str, Any]:
+       """Convert config to dictionary for JSON serialization."""
+       return asdict(self)
+
+
 @dataclass
 class GenerationResult:
     """Result of music generation.
@@ -149,34 +161,80 @@ class GenerationResult:
    Attributes:
        # Audio Outputs
        audios: List of audio dictionaries with paths, keys, params
-       generation_info: Markdown-formatted generation information
        status_message: Status message from generation
        extra_outputs: Extra outputs from generation
        success: Whether generation completed successfully
        error: Error message if generation failed
    """
-
+
    # Audio Outputs
    audios: List[Dict[str, Any]] = field(default_factory=list)
    # Generation Information
-   generation_info: str = ""
    status_message: str = ""
    extra_outputs: Dict[str, Any] = field(default_factory=dict)
    # Success Status
    success: bool = True
    error: Optional[str] = None
-
+
    def to_dict(self) -> Dict[str, Any]:
        """Convert result to dictionary for JSON serialization."""
        return asdict(self)
 
 
+def _update_metadata_from_lm(
+    metadata: Dict[str, Any],
+    bpm: Optional[int],
+    key_scale: str,
+    time_signature: str,
+    audio_duration: Optional[float],
+    vocal_language: str,
+    caption: str,
+    lyrics: str,
+) -> Tuple[Optional[int], str, str, Optional[float]]:
+    """Update metadata fields from LM output if not provided by user."""
+
+    if bpm is None and metadata.get('bpm'):
+        bpm_value = metadata.get('bpm')
+        if bpm_value not in ["N/A", ""]:
+            try:
+                bpm = int(bpm_value)
+            except (ValueError, TypeError):
+                pass
+
+    if not key_scale and metadata.get('keyscale'):
+        key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
+        if key_scale_value != "N/A":
+            key_scale = key_scale_value
+
+    if not time_signature and metadata.get('timesignature'):
+        time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
+        if time_signature_value != "N/A":
+            time_signature = time_signature_value
+
+    if audio_duration is None or audio_duration <= 0:
+        audio_duration_value = metadata.get('duration', -1)
+        if audio_duration_value not in ["N/A", ""]:
+            try:
+                audio_duration = float(audio_duration_value)
+            except (ValueError, TypeError):
+                pass
+
+    if not vocal_language and metadata.get('vocal_language'):
+        vocal_language = metadata.get('vocal_language')
+    if not caption and metadata.get('caption'):
+        caption = metadata.get('caption')
+    if not lyrics and metadata.get('lyrics'):
+        lyrics = metadata.get('lyrics')
+    return bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics
+
+
 def generate_music(
     dit_handler,
     llm_handler,
     params: GenerationParams,
     config: GenerationConfig,
     save_dir: Optional[str] = None,
+    progress=None,
 ) -> GenerationResult:
     """Generate music using ACE-Step model with optional LM reasoning.
 
@@ -194,24 +252,31 @@ def generate_music(
        audio_code_string_to_use = params.audio_codes
        lm_generated_metadata = None
        lm_generated_audio_codes_list = []
-
+       lm_total_time_costs = {
+           "phase1_time": 0.0,
+           "phase2_time": 0.0,
+           "total_time": 0.0,
+       }
+
        # Extract mutable copies of metadata (will be updated by LM if needed)
        bpm = params.bpm
        key_scale = params.keyscale
        time_signature = params.timesignature
        audio_duration = params.duration
-
+       dit_input_caption = params.caption
+       dit_input_vocal_language = params.vocal_language
+       dit_input_lyrics = params.lyrics
        # Determine if we need to generate audio codes
        # If user has provided audio_codes, we don't need to generate them
        # Otherwise, check if we need audio codes (lm_dit mode) or just metas (dit mode)
        user_provided_audio_codes = bool(params.audio_codes and str(params.audio_codes).strip())
-
+
        # Determine infer_type: use "llm_dit" if we need audio codes, "dit" if only metas needed
        # For now, we use "llm_dit" if batch mode or if user hasn't provided codes
        # Use "dit" if user has provided codes (only need metas) or if explicitly only need metas
        # Note: This logic can be refined based on specific requirements
        need_audio_codes = not user_provided_audio_codes
-
+
        # Determine if we should use chunk-based LM generation (always use chunks for consistency)
        # Determine actual batch size for chunk processing
        actual_batch_size = config.batch_size if config.batch_size is not None else 1
@@ -219,80 +284,75 @@ def generate_music(
        # Prepare seeds for batch generation
        # Use config.seed if provided, otherwise fallback to params.seed
        # Convert config.seed (None, int, or List[int]) to format that prepare_seeds accepts
-       seed_for_generation = params.seed  # Default fallback
-       if config.seed is not None:
-           if isinstance(config.seed, list):
+       seed_for_generation = ""
+       if config.seeds is not None and len(config.seeds) > 0:
+           if isinstance(config.seeds, list):
                # Convert List[int] to comma-separated string
-               seed_for_generation = ",".join(str(s) for s in config.seed)
-           elif isinstance(config.seed, int):
-               # Single int seed
-               seed_for_generation = config.seed
-
+               seed_for_generation = ",".join(str(s) for s in config.seeds)
+
        # Use dit_handler.prepare_seeds to handle seed list generation and padding
        # This will handle all the logic: padding with random seeds if needed, etc.
-       actual_seed_list, _ = dit_handler.prepare_seeds(
-           actual_batch_size, seed_for_generation, config.use_random_seed
-       )
+       actual_seed_list, _ = dit_handler.prepare_seeds(actual_batch_size, seed_for_generation, config.use_random_seed)
 
        # LM-based Chain-of-Thought reasoning
-       if params.thinking and llm_handler.llm_initialized and params.use_cot_metas:
-           # Convert sampling parameters
-           top_k_value = None if params.lm_top_k == 0 else int(params.lm_top_k)
-           top_p_value = None if params.lm_top_p >= 1.0 else params.lm_top_p
-
+       use_lm = params.thinking and llm_handler.llm_initialized
+       lm_status = []
+       if use_lm:
+           # Convert sampling parameters - handle None values safely
+           top_k_value = None if not params.lm_top_k or params.lm_top_k == 0 else int(params.lm_top_k)
+           top_p_value = None if not params.lm_top_p or params.lm_top_p >= 1.0 else params.lm_top_p
+
           # Build user_metadata from user-provided values
           user_metadata = {}
           if bpm is not None:
               try:
                   bpm_value = float(bpm)
                   if bpm_value > 0:
-                      user_metadata['bpm'] = str(int(bpm_value))
+                      user_metadata['bpm'] = int(bpm_value)
               except (ValueError, TypeError):
                   pass
-
+
           if key_scale and key_scale.strip():
               key_scale_clean = key_scale.strip()
               if key_scale_clean.lower() not in ["n/a", ""]:
                   user_metadata['keyscale'] = key_scale_clean
-
+
           if time_signature and time_signature.strip():
               time_sig_clean = time_signature.strip()
               if time_sig_clean.lower() not in ["n/a", ""]:
                   user_metadata['timesignature'] = time_sig_clean
-
+
           if audio_duration is not None:
               try:
                   duration_value = float(audio_duration)
                   if duration_value > 0:
-                      user_metadata['duration'] = str(int(duration_value))
+                      user_metadata['duration'] = int(duration_value)
               except (ValueError, TypeError):
                   pass
-
+
           user_metadata_to_pass = user_metadata if user_metadata else None
-
+
           # Determine infer_type based on whether we need audio codes
           # - "llm_dit": generates both metas and audio codes (two-phase internally)
           # - "dit": generates only metas (single phase)
           infer_type = "llm_dit" if need_audio_codes else "dit"
-
+
           # Use chunk size from config, or default to batch_size if not set
           max_inference_batch_size = int(config.lm_batch_chunk_size) if config.lm_batch_chunk_size > 0 else actual_batch_size
           num_chunks = math.ceil(actual_batch_size / max_inference_batch_size)
-
+
           all_metadata_list = []
           all_audio_codes_list = []
-
+
           for chunk_idx in range(num_chunks):
               chunk_start = chunk_idx * max_inference_batch_size
               chunk_end = min(chunk_start + max_inference_batch_size, actual_batch_size)
               chunk_size = chunk_end - chunk_start
               chunk_seeds = actual_seed_list[chunk_start:chunk_end] if chunk_start < len(actual_seed_list) else None
-
-              logger.info(
-                  f"LM chunk {chunk_idx+1}/{num_chunks} (infer_type={infer_type}) "
-                  f"(size: {chunk_size}, seeds: {chunk_seeds})"
-              )
-
+
+              logger.info(f"LM chunk {chunk_idx+1}/{num_chunks} (infer_type={infer_type}) "
+                          f"(size: {chunk_size}, seeds: {chunk_seeds})")
+
              # Use the determined infer_type
              # - "llm_dit" will internally run two phases (metas + codes)
              # - "dit" will only run phase 1 (metas only)
@@ -308,25 +368,54 @@ def generate_music(
                  user_metadata=user_metadata_to_pass,
                  use_cot_caption=params.use_cot_caption,
                  use_cot_language=params.use_cot_language,
-                 is_format_caption=config.is_format_caption,
-                 use_constrained_decoding=config.use_constrained_decoding,
+                 use_cot_metas=params.use_cot_metas,
+                 use_constrained_decoding=params.use_constrained_decoding,
                  constrained_decoding_debug=config.constrained_decoding_debug,
                  batch_size=chunk_size,
                  seeds=chunk_seeds,
+                 progress=progress,
              )
-
+
+             # Check if LM generation failed
+             if not result.get("success", False):
+                 error_msg = result.get("error", "Unknown LM error")
+                 lm_status.append(f"āŒ LM Error: {error_msg}")
+                 # Return early with error
+                 return GenerationResult(
+                     audios=[],
+                     status_message=f"āŒ LM generation failed: {error_msg}",
+                     extra_outputs={},
+                     success=False,
+                     error=error_msg,
+                 )
+
+             # Extract metadata and audio_codes from result dict
              if chunk_size > 1:
-                 metadata_list, audio_codes_list, status = result
+                 metadata_list = result.get("metadata", [])
+                 audio_codes_list = result.get("audio_codes", [])
                  all_metadata_list.extend(metadata_list)
                  all_audio_codes_list.extend(audio_codes_list)
              else:
-                 metadata, audio_codes, status = result
+                 metadata = result.get("metadata", {})
+                 audio_codes = result.get("audio_codes", "")
                  all_metadata_list.append(metadata)
                  all_audio_codes_list.append(audio_codes)
-
+
+             # Collect time costs from LM extra_outputs
+             lm_extra = result.get("extra_outputs", {})
+             lm_chunk_time_costs = lm_extra.get("time_costs", {})
+             if lm_chunk_time_costs:
+                 # Accumulate time costs from all chunks
+                 for key in ["phase1_time", "phase2_time", "total_time"]:
+                     if key in lm_chunk_time_costs:
+                         lm_total_time_costs[key] += lm_chunk_time_costs[key]
+
+                 time_str = ", ".join([f"{k}: {v:.2f}s" for k, v in lm_chunk_time_costs.items()])
+                 lm_status.append(f"āœ… LM chunk {chunk_idx+1}: {time_str}")
+
          lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
          lm_generated_audio_codes_list = all_audio_codes_list
-
+
          # Set audio_code_string_to_use based on infer_type
          if infer_type == "llm_dit":
              # If batch mode, use list; otherwise use single string
@@ -337,23 +426,48 @@ def generate_music(
          else:
              # For "dit" mode, keep user-provided codes or empty
              audio_code_string_to_use = params.audio_codes
-
+
          # Update metadata from LM if not provided by user
          if lm_generated_metadata:
-             bpm, key_scale, time_signature, audio_duration = _update_metadata_from_lm(
-                 lm_generated_metadata, bpm, key_scale, time_signature, audio_duration
-             )
+             bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics = _update_metadata_from_lm(
+                 metadata=lm_generated_metadata,
+                 bpm=bpm,
+                 key_scale=key_scale,
+                 time_signature=time_signature,
+                 audio_duration=audio_duration,
+                 vocal_language=dit_input_vocal_language,
+                 caption=dit_input_caption,
+                 lyrics=dit_input_lyrics)
+             if not params.bpm:
+                 params.cot_bpm = bpm
+             if not params.keyscale:
+                 params.cot_keyscale = key_scale
+             if not params.timesignature:
+                 params.cot_timesignature = time_signature
+             if not params.duration:
+                 params.cot_duration = audio_duration
+             if not params.vocal_language:
+                 params.cot_vocal_language = vocal_language
+             if not params.caption:
+                 params.cot_caption = caption
+             if not params.lyrics:
+                 params.cot_lyrics = lyrics
+
+             # set cot caption and language if needed
+             if params.use_cot_caption:
+                 dit_input_caption = lm_generated_metadata.get("caption", dit_input_caption)
+             if params.use_cot_language:
+                 dit_input_vocal_language = lm_generated_metadata.get("vocal_language", dit_input_vocal_language)
 
        # Phase 2: DiT music generation
        # Use seed_for_generation (from config.seed or params.seed) instead of params.seed for actual generation
        result = dit_handler.generate_music(
-           captions=params.caption,
-           lyrics=params.lyrics,
+           captions=dit_input_caption,
+           lyrics=dit_input_lyrics,
            bpm=bpm,
            key_scale=key_scale,
            time_signature=time_signature,
-           vocal_language=params.vocal_language,
+           vocal_language=dit_input_vocal_language,
            inference_steps=params.inference_steps,
            guidance_scale=params.guidance_scale,
            use_random_seed=config.use_random_seed,
@@ -371,110 +485,80 @@ def generate_music(
            use_adg=params.use_adg,
            cfg_interval_start=params.cfg_interval_start,
            cfg_interval_end=params.cfg_interval_end,
+           progress=progress,
        )
-
+
        # Check if generation failed
        if not result.get("success", False):
            return GenerationResult(
                audios=[],
-               generation_info=result.get("generation_info", ""),
                status_message=result.get("status_message", ""),
                extra_outputs={},
                success=False,
                error=result.get("error"),
            )
-
+
        # Extract results from dit_handler.generate_music dict
        dit_audios = result.get("audios", [])
-       generation_info = result.get("generation_info", "")
        status_message = result.get("status_message", "")
        dit_extra_outputs = result.get("extra_outputs", {})
-
-       # Append LM metadata to generation info
-       if lm_generated_metadata:
-           generation_info = _append_lm_metadata_to_info(generation_info, lm_generated_metadata)
-
+
        # Use the seed list already prepared above (from config.seed or params.seed fallback)
        # actual_seed_list was computed earlier using dit_handler.prepare_seeds
        seed_list = actual_seed_list
-
+
        # Get base params dictionary
        base_params_dict = params.to_dict()
-
+
        # Save audio files using AudioSaver (format from config)
        audio_format = config.audio_format if config.audio_format else "flac"
        audio_saver = AudioSaver(default_format=audio_format)
-
+
        # Use handler's temp_dir for saving files
        if save_dir is not None:
            os.makedirs(save_dir, exist_ok=True)
-
+
        # Build audios list for GenerationResult with params and save files
        # Audio saving and UUID generation handled here, outside of handler
        audios = []
        for idx, dit_audio in enumerate(dit_audios):
            # Create a copy of params dict for this audio
            audio_params = base_params_dict.copy()
-
+
            # Update audio-specific values
            audio_params["seed"] = seed_list[idx] if idx < len(seed_list) else None
-
+
            # Add audio codes if batch mode
            if lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list):
                audio_params["audio_codes"] = lm_generated_audio_codes_list[idx]
-
+
            # Get audio tensor and metadata
            audio_tensor = dit_audio.get("tensor")
            sample_rate = dit_audio.get("sample_rate", 48000)
-
+
            # Generate UUID for this audio (moved from handler)
            batch_seed = seed_list[idx] if idx < len(seed_list) else seed_list[0] if seed_list else -1
-           audio_code_str = lm_generated_audio_codes_list[idx] if (lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list)) else audio_code_string_to_use
+           audio_code_str = lm_generated_audio_codes_list[idx] if (
+               lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list)) else audio_code_string_to_use
            if isinstance(audio_code_str, list):
                audio_code_str = audio_code_str[idx] if idx < len(audio_code_str) else ""
-
-           audio_key = generate_uuid_from_params(
-               captions=params.caption,
-               lyrics=params.lyrics,
-               bpm=bpm,
-               key_scale=key_scale,
-               time_signature=time_signature,
-               vocal_language=params.vocal_language,
-               inference_steps=params.inference_steps,
-               guidance_scale=params.guidance_scale,
-               seed=batch_seed,
-               audio_duration=audio_duration,
-               audio_code_string=audio_code_str,
-               repainting_start=params.repainting_start,
-               repainting_end=params.repainting_end,
-               instruction=params.instruction,
-               audio_cover_strength=params.audio_cover_strength,
-               task_type=params.task_type,
-               use_adg=params.use_adg,
-               cfg_interval_start=params.cfg_interval_start,
-               cfg_interval_end=params.cfg_interval_end,
-               audio_format=audio_format,
-               reference_audio=params.reference_audio,
-               src_audio=params.src_audio,
-               batch_index=idx,
-           )
-
+
+           audio_key = generate_uuid_from_params(audio_params)
+
           # Save audio file (handled outside handler)
           audio_path = None
           if audio_tensor is not None and save_dir is not None:
               try:
                   audio_file = os.path.join(save_dir, f"{audio_key}.{audio_format}")
-                  audio_path = audio_saver.save_audio(
-                      audio_tensor,
-                      audio_file,
-                      sample_rate=sample_rate,
-                      format=audio_format,
-                      channels_first=True
-                  )
+                  audio_path = audio_saver.save_audio(audio_tensor,
+                                                      audio_file,
+                                                      sample_rate=sample_rate,
+                                                      format=audio_format,
+                                                      channels_first=True)
               except Exception as e:
                   logger.error(f"[generate_music] Failed to save audio file: {e}")
                   audio_path = ""  # Fallback to empty path
-
+
           audio_dict = {
               "path": audio_path or "",  # File path (saved here, not in handler)
               "tensor": audio_tensor,  # Audio tensor [channels, samples], CPU, float32
@@ -482,259 +566,55 @@ def generate_music(
              "sample_rate": sample_rate,
              "params": audio_params,
          }
-
+
          audios.append(audio_dict)
-
+
       # Merge extra_outputs: include dit_extra_outputs (latents, masks) and add LM metadata
       extra_outputs = dit_extra_outputs.copy()
       extra_outputs["lm_metadata"] = lm_generated_metadata
-
+
+      # Merge time_costs from both LM and DiT into a unified dictionary
+      unified_time_costs = {}
+
+      # Add LM time costs (if LM was used)
+      if use_lm and lm_total_time_costs:
+          for key, value in lm_total_time_costs.items():
+              unified_time_costs[f"lm_{key}"] = value
+
+      # Add DiT time costs (if available)
+      dit_time_costs = dit_extra_outputs.get("time_costs", {})
+      if dit_time_costs:
+          for key, value in dit_time_costs.items():
+              unified_time_costs[f"dit_{key}"] = value
+
+      # Calculate total pipeline time
+      if unified_time_costs:
+          lm_total = unified_time_costs.get("lm_total_time", 0.0)
+          dit_total = unified_time_costs.get("dit_total_time_cost", 0.0)
+          unified_time_costs["pipeline_total_time"] = lm_total + dit_total
+
+      # Update extra_outputs with unified time_costs
+      extra_outputs["time_costs"] = unified_time_costs
+
+      if lm_status:
+          status_message = "\n".join(lm_status) + "\n" + status_message
+      else:
+          status_message = status_message
       # Create and return GenerationResult
       return GenerationResult(
           audios=audios,
-          generation_info=generation_info,
           status_message=status_message,
           extra_outputs=extra_outputs,
           success=True,
           error=None,
       )
-
+
   except Exception as e:
       logger.exception("Music generation failed")
       return GenerationResult(
          audios=[],
-         generation_info=f"āŒ Generation failed: {str(e)}",
         status_message=f"Error: {str(e)}",
         extra_outputs={},
         success=False,
         error=str(e),
      )
-
-
-def _update_metadata_from_lm(
-    metadata: Dict[str, Any],
-    bpm: Optional[int],
-    key_scale: str,
-    time_signature: str,
-    audio_duration: Optional[float],
-) -> Tuple[Optional[int], str, str, Optional[float]]:
-    """Update metadata fields from LM output if not provided by user."""
-
-    if bpm is None and metadata.get('bpm'):
-        bpm_value = metadata.get('bpm')
-        if bpm_value not in ["N/A", ""]:
-            try:
-                bpm = int(bpm_value)
-            except (ValueError, TypeError):
-                pass
-
-    if not key_scale and metadata.get('keyscale'):
-        key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
-        if key_scale_value != "N/A":
-            key_scale = key_scale_value
-
-    if not time_signature and metadata.get('timesignature'):
-        time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
-        if time_signature_value != "N/A":
-            time_signature = time_signature_value
-
-    if audio_duration is None or audio_duration <= 0:
-        audio_duration_value = metadata.get('duration', -1)
-        if audio_duration_value not in ["N/A", ""]:
-            try:
-                audio_duration = float(audio_duration_value)
-            except (ValueError, TypeError):
-                pass
-
-    return bpm, key_scale, time_signature, audio_duration
-
-
-def _append_lm_metadata_to_info(generation_info: str, metadata: Dict[str, Any]) -> str:
-    """Append LM-generated metadata to generation info string."""
-
-    metadata_lines = []
-    if metadata.get('bpm'):
-        metadata_lines.append(f"- **BPM:** {metadata['bpm']}")
-    if metadata.get('caption'):
-        metadata_lines.append(f"- **Refined Caption:** {metadata['caption']}")
-    if metadata.get('duration'):
-        metadata_lines.append(f"- **Duration:** {metadata['duration']} seconds")
-    if metadata.get('keyscale'):
-        metadata_lines.append(f"- **Key Scale:** {metadata['keyscale']}")
-    if metadata.get('language'):
-        metadata_lines.append(f"- **Language:** {metadata['language']}")
-    if metadata.get('timesignature'):
-        metadata_lines.append(f"- **Time Signature:** {metadata['timesignature']}")
-
-    if metadata_lines:
-        metadata_section = "\n\n**šŸ¤– LM-Generated Metadata:**\n" + "\n\n".join(metadata_lines)
-        return metadata_section + "\n\n" + generation_info
-
-    return generation_info
-
-
-# ============================================================================
-# LEGACY GRADIO UI COMPATIBILITY LAYER
-# ============================================================================
-
-def generate_for_gradio(
-    dit_handler,
-    llm_handler,
-    captions,
-    lyrics,
-    bpm,
-    key_scale,
-    time_signature,
-    vocal_language,
-    inference_steps,
-    guidance_scale,
-    random_seed_checkbox,
-    seed,
-    reference_audio,
-    audio_duration,
-    batch_size_input,
-    src_audio,
-    text2music_audio_code_string,
-    repainting_start,
-    repainting_end,
-    instruction_display_gen,
-    audio_cover_strength,
-    task_type,
-    use_adg,
-    cfg_interval_start,
-    cfg_interval_end,
-    audio_format,
-    lm_temperature,
-    think_checkbox,
-    lm_cfg_scale,
-    lm_top_k,
-    lm_top_p,
-    lm_negative_prompt,
-    use_cot_metas,
-    use_cot_caption,
-    use_cot_language,
-    is_format_caption,
-    constrained_decoding_debug,
-    allow_lm_batch,
-    lm_batch_chunk_size,
-):
-    """Legacy Gradio UI compatibility wrapper.
-
-    This function maintains backward compatibility with the Gradio UI.
-    For new integrations, use generate_music() with GenerationConfig instead.
-
-    Returns:
-        Tuple with 28 elements for Gradio UI component updates
-    """
-
-    # Convert legacy parameters to GenerationParams and GenerationConfig
-    params = GenerationParams(
-        caption=captions,
-        lyrics=lyrics,
-        bpm=bpm,
-        keyscale=key_scale,
-        timesignature=time_signature,
-        vocal_language=vocal_language,
-        audio_codes=text2music_audio_code_string,
-        duration=audio_duration,
-        inference_steps=inference_steps,
-        guidance_scale=guidance_scale,
-        seed=seed,
-        use_adg=use_adg,
-        cfg_interval_start=cfg_interval_start,
-        cfg_interval_end=cfg_interval_end,
-        audio_format=audio_format,
-        task_type=task_type,
-        reference_audio=reference_audio,
-        src_audio=src_audio,
-        repainting_start=repainting_start,
-        repainting_end=repainting_end,
-        audio_cover_strength=audio_cover_strength,
-        instruction=instruction_display_gen,
-        thinking=think_checkbox,
-        lm_temperature=lm_temperature,
-        lm_cfg_scale=lm_cfg_scale,
-        lm_top_k=lm_top_k,
-        lm_top_p=lm_top_p,
-        lm_negative_prompt=lm_negative_prompt,
-        use_cot_metas=use_cot_metas,
-        use_cot_caption=use_cot_caption,
-        use_cot_language=use_cot_language,
-    )
-
-    config = GenerationConfig(batch_size=1)
-    config.batch_size = batch_size_input
-    config.use_random_seed = random_seed_checkbox
-    config.allow_lm_batch = allow_lm_batch
-    config.lm_batch_chunk_size = lm_batch_chunk_size
-    config.is_format_caption = is_format_caption
-    config.constrained_decoding_debug = constrained_decoding_debug
-
-    # Call new API
-    result = generate_music(dit_handler, llm_handler, params, config)
-
-    # Extract audio paths from result.audios
-    audio_paths = [audio["path"] for audio in result.audios]
-
-    # Extract extra outputs
-    extra_outputs = result.extra_outputs
-    seed_value = extra_outputs.get("seed_value", "")
-    lm_metadata = extra_outputs.get("lm_metadata", None)
-
-    # Legacy alignment fields (no longer used, set to empty/None)
-    align_score_1 = ""
-    align_text_1 = ""
-    align_plot_1 = None
-    align_score_2 = ""
-    align_text_2 = ""
-    align_plot_2 = None
-
-    # Determine which codes to update in UI
-    if config.allow_lm_batch and lm_metadata:
-        # Batch mode: extract codes from metadata if available
-        lm_codes_list = lm_metadata.get('audio_codes_list', [])
-        updated_audio_codes = lm_codes_list[0] if lm_codes_list else text2music_audio_code_string
-        codes_outputs = (lm_codes_list + [""] * 8)[:8]
-    else:
-        # Single mode
-        lm_codes = lm_metadata.get('audio_codes', '') if lm_metadata else ''
-        updated_audio_codes = lm_codes if lm_codes else text2music_audio_code_string
-        codes_outputs = [""] * 8
-
-    # Prepare audio outputs (up to 8)
-    audio_outputs = (audio_paths + [None] * 8)[:8]
-
-    # Return tuple for Gradio UI (28 elements)
-    return (
-        audio_outputs[0],  # generated_audio_1
-        audio_outputs[1],  # generated_audio_2
-        audio_outputs[2],  # generated_audio_3
-        audio_outputs[3],  # generated_audio_4
-        audio_outputs[4],  # generated_audio_5
-        audio_outputs[5],  # generated_audio_6
-        audio_outputs[6],  # generated_audio_7
-        audio_outputs[7],  # generated_audio_8
-        audio_paths,  # generated_audio_batch
-        result.generation_info,
-        result.status_message,
-        seed_value,
-        align_score_1,
-        align_text_1,
-        align_plot_1,
-        align_score_2,
-        align_text_2,
-        align_plot_2,
-        updated_audio_codes,  # Update main audio codes in UI
-        codes_outputs[0],  # text2music_audio_code_string_1
-        codes_outputs[1],  # text2music_audio_code_string_2
-        codes_outputs[2],  # text2music_audio_code_string_3
-        codes_outputs[3],  # text2music_audio_code_string_4
-        codes_outputs[4],  # text2music_audio_code_string_5
-        codes_outputs[5],  # text2music_audio_code_string_6
-        codes_outputs[6],  # text2music_audio_code_string_7
-        codes_outputs[7],  # text2music_audio_code_string_8
-        lm_metadata,  # Store metadata for "Send to src audio" buttons
-        is_format_caption,  # Keep is_format_caption unchanged
-    )
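A hedged usage sketch for the refactored API above: `seeds` replaces `seed` on `GenerationConfig`, and `generate_music` now accepts an optional `progress` callback. Handler construction is elided; only names visible in this diff are used.

```python
# Hedged sketch: drives the new API with explicit seeds and a progress callback.
# dit_handler / llm_handler stand in for already-initialized handlers.
from acestep.inference import GenerationParams, GenerationConfig, generate_music

def run_once(dit_handler, llm_handler):
    params = GenerationParams(caption="mellow lofi hip hop", thinking=True)
    config = GenerationConfig(batch_size=2, seeds=[42, 43], use_random_seed=False)

    def progress(fraction, desc=""):
        # Matches both call styles in the diff:
        # progress(0.1, "...") and progress(0.99, desc="...")
        print(f"{fraction:.0%} {desc}")

    result = generate_music(dit_handler, llm_handler, params, config,
                            save_dir="./outputs", progress=progress)
    print(result.status_message)
    # Unified LM + DiT timings introduced in this commit
    print(result.extra_outputs.get("time_costs", {}))
    return result
```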
acestep/llm_inference.py CHANGED
@@ -5,6 +5,7 @@ Handles all LM-related operations including initialization and generation
 import os
 import traceback
 import time
+import random
 from typing import Optional, Dict, Any, Tuple, List, Union
 from contextlib import contextmanager
 
@@ -309,6 +310,7 @@ class LLMHandler:
 
        logger.info("loading 5Hz LM tokenizer...")
        start_time = time.time()
+       # TODO: load tokenizer too slow, not found solution yet
        llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
        logger.info(f"5Hz LM tokenizer loaded successfully in {time.time() - start_time:.2f} seconds")
        self.llm_tokenizer = llm_tokenizer
@@ -796,12 +798,13 @@ class LLMHandler:
        constrained_decoding_debug: bool = False,
        target_duration: Optional[float] = None,
        user_metadata: Optional[Dict[str, Optional[str]]] = None,
+       use_cot_metas: bool = True,
        use_cot_caption: bool = True,
        use_cot_language: bool = True,
-       is_format_caption: bool = False,
        batch_size: Optional[int] = None,
        seeds: Optional[List[int]] = None,
-   ) -> Union[Tuple[Dict[str, Any], str, str], Tuple[List[Dict[str, Any]], List[str], str]]:
+       progress=None,
+   ) -> Dict[str, Any]:
        """Two-phase LM generation: CoT generation followed by audio codes generation.
 
        - infer_type='dit': Phase 1 only - generate CoT and return metas (no audio codes)
@@ -817,20 +820,30 @@ class LLMHandler:
            batch_size: Optional batch size for batch generation. If None or 1, returns single result.
                If > 1, returns batch results (lists).
            seeds: Optional list of seeds for batch generation (for reproducibility).
-               Only used when batch_size > 1.
+               Only used when batch_size > 1. TODO: not used yet
 
        Returns:
-           If batch_size is None or 1: (metadata, audio_codes, status_msg)
-           If batch_size > 1: (metadata_list, audio_codes_list, status_msg)
+           Dictionary containing:
+           - metadata: Dict or List[Dict] - Generated metadata
+           - audio_codes: str or List[str] - Generated audio codes
+           - success: bool - Whether generation succeeded
+           - error: Optional[str] - Error message if failed
+           - extra_outputs: Dict with time_costs and other info
        """
-       import time
-       import random
-
+       if progress is None:
+           def progress(*args, **kwargs):
+               pass
+
       infer_type = (infer_type or "").strip().lower()
       if infer_type not in {"dit", "llm_dit"}:
-          if batch_size and batch_size > 1:
-              return [], [], f"āŒ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
-          return {}, "", f"āŒ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
+          error_msg = f"invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
+          return {
+              "metadata": [] if (batch_size and batch_size > 1) else {},
+              "audio_codes": [] if (batch_size and batch_size > 1) else "",
+              "success": False,
+              "error": error_msg,
+              "extra_outputs": {"time_costs": {}},
+          }
 
       # Determine if batch mode
       is_batch = batch_size and batch_size > 1
@@ -854,7 +867,8 @@ class LLMHandler:
 
       # ========== PHASE 1: CoT Generation ==========
       # Skip CoT if all metadata are user-provided OR caption is already formatted
-      if not has_all_metas and not is_format_caption:
+      progress(0.1, f"Phase 1: Generating CoT metadata (once for all items)...")
+      if not has_all_metas and use_cot_metas:
          if is_batch:
              logger.info("Batch Phase 1: Generating CoT metadata (once for all items)...")
          else:
@@ -893,9 +907,13 @@ class LLMHandler:
          phase1_time = time.time() - phase1_start
 
          if not cot_output_text:
-             if is_batch:
-                 return [], [], status
-             return {}, "", status
+             return {
+                 "metadata": [] if is_batch else {},
+                 "audio_codes": [] if is_batch else "",
+                 "success": False,
+                 "error": status,
+                 "extra_outputs": {"time_costs": {"phase1_time": phase1_time}},
+             }
 
          # Parse metadata from CoT output
          metadata, _ = self.parse_lm_output(cot_output_text)
@@ -915,11 +933,31 @@ class LLMHandler:
      if infer_type == "dit":
          if is_batch:
              metadata_list = [metadata.copy() for _ in range(actual_batch_size)]
-             status_msg = f"āœ… Generated CoT metadata successfully (batch mode)\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
-             return metadata_list, [""] * actual_batch_size, status_msg
+             return {
+                 "metadata": metadata_list,
+                 "audio_codes": [""] * actual_batch_size,
+                 "success": True,
+                 "error": None,
+                 "extra_outputs": {
+                     "time_costs": {
+                         "phase1_time": phase1_time,
+                         "total_time": phase1_time,
+                     }
+                 },
+             }
          else:
-             status_msg = f"āœ… Generated CoT metadata successfully\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
-             return metadata, "", status_msg
+             return {
+                 "metadata": metadata,
+                 "audio_codes": "",
+                 "success": True,
+                 "error": None,
+                 "extra_outputs": {
+                     "time_costs": {
+                         "phase1_time": phase1_time,
+                         "total_time": phase1_time,
+                     }
+                 },
+             }
 
      # ========== PHASE 2: Audio Codes Generation ==========
      if is_batch:
@@ -935,6 +973,7 @@ class LLMHandler:
      formatted_prompt_with_cot = self.build_formatted_prompt_with_cot(caption, lyrics, cot_text)
      logger.info(f"generate_with_stop_condition: formatted_prompt_with_cot={formatted_prompt_with_cot}")
 
+     progress(0.5, f"Phase 2: Generating audio codes for {actual_batch_size} items...")
      if is_batch:
          # Batch mode: generate codes for all items
          formatted_prompts = [formatted_prompt_with_cot] * actual_batch_size
@@ -978,9 +1017,21 @@ class LLMHandler:
                  seeds=seeds,
              )
          except Exception as e:
-             error_msg = f"āŒ Error in batch codes generation: {str(e)}"
+             error_msg = f"Error in batch codes generation: {str(e)}"
              logger.error(error_msg)
-             return [], [], error_msg
+             return {
+                 "metadata": [],
+                 "audio_codes": [],
+                 "success": False,
+                 "error": error_msg,
+                 "extra_outputs": {
+                     "time_costs": {
+                         "phase1_time": phase1_time,
+                         "phase2_time": 0.0,
+                         "total_time": phase1_time,
+                     }
+                 },
+             }
 
          # Parse audio codes from each output
          audio_codes_list = []
@@ -996,8 +1047,22 @@ class LLMHandler:
          codes_counts = [len(codes.split('<|audio_code_')) - 1 if codes else 0 for codes in audio_codes_list]
          logger.info(f"Batch Phase 2 completed in {phase2_time:.2f}s. Generated codes: {codes_counts}")
 
-         status_msg = f"āœ… Batch generation completed ({actual_batch_size} items)\nPhase 1: CoT metadata\nPhase 2: {sum(codes_counts)} total codes ({codes_counts})\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
-         return metadata_list, audio_codes_list, status_msg
+         total_time = phase1_time + phase2_time
+         return {
+             "metadata": metadata_list,
+             "audio_codes": audio_codes_list,
+             "success": True,
+             "error": None,
+             "extra_outputs": {
+                 "time_costs": {
+                     "phase1_time": phase1_time,
+                     "phase2_time": phase2_time,
+                     "total_time": total_time,
+                 },
+                 "codes_counts": codes_counts,
+                 "total_codes": sum(codes_counts),
+             },
+         }
      else:
          # Single mode: generate codes for one item
          codes_output_text, status = self.generate_from_formatted_prompt(
@@ -1025,7 +1090,20 @@ class LLMHandler:
          )
 
          if not codes_output_text:
-             return metadata, "", status
+             total_time = phase1_time + phase2_time
+             return {
+                 "metadata": metadata,
+                 "audio_codes": "",
+                 "success": False,
+                 "error": status,
+                 "extra_outputs": {
+                     "time_costs": {
+                         "phase1_time": phase1_time,
+                         "phase2_time": phase2_time,
+                         "total_time": total_time,
+                     }
+                 },
+             }
 
          phase2_time = time.time() - phase2_start
 
@@ -1035,8 +1113,21 @@ class LLMHandler:
          codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
          logger.info(f"Phase 2 completed in {phase2_time:.2f}s. Generated {codes_count} audio codes")
 
-         status_msg = f"āœ… Generated successfully (2-phase)\nPhase 1: CoT metadata\nPhase 2: {codes_count} audio codes\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
-         return metadata, audio_codes, status_msg
+         total_time = phase1_time + phase2_time
+         return {
+             "metadata": metadata,
+             "audio_codes": audio_codes,
+             "success": True,
+             "error": None,
+             "extra_outputs": {
+                 "time_costs": {
+                     "phase1_time": phase1_time,
+                     "phase2_time": phase2_time,
+                     "total_time": total_time,
+                 },
+                 "codes_count": codes_count,
+             },
+         }
 
    def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False, generation_phase: str = "cot", negative_prompt: str = "NO USER INPUT") -> str:
        """
acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py CHANGED
@@ -93,6 +93,8 @@ class ModelRunner:
     def _allocate_sample_buffers(self):
         """Pre-allocate reusable buffers for sampling to avoid repeated tensor creation."""
         max_bs = self.config.max_num_seqs
+        max_tokens = self.config.max_num_batched_tokens
+        max_num_blocks = (self.config.max_model_len + self.block_size - 1) // self.block_size
 
         # Pre-allocate pinned memory buffers on CPU for fast transfer
         # Must explicitly specify device="cpu" since default device may be "cuda"
@@ -107,6 +109,19 @@ class ModelRunner:
         self._cpu_positions = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)
         self._cpu_slot_mapping = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
         self._cpu_context_lens = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
+
+        # Pre-allocate prefill buffers on CPU with pinned memory (optimization to avoid repeated tensor creation)
+        self._cpu_prefill_input_ids = torch.zeros(max_tokens, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_prefill_positions = torch.zeros(max_tokens, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_prefill_cu_seqlens = torch.zeros(max_bs + 1, dtype=torch.int32, device="cpu", pin_memory=True)
+        self._cpu_prefill_slot_mapping = torch.zeros(max_tokens, dtype=torch.int32, device="cpu", pin_memory=True)
+
+        # Pre-allocate block tables buffer (shared by both decode and prefill)
+        self._cpu_block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32, device="cpu", pin_memory=True)
+
+        # Pre-allocate buffer for sequence token IDs (used in logits processor and sampler)
+        # Max length is max_model_len since sequences can be that long
+        self._seq_token_ids_buffer = torch.zeros(max_bs, self.config.max_model_len, dtype=torch.int64, device="cpu", pin_memory=True)
 
     def exit(self):
         if self.world_size > 1:
@@ -227,7 +242,7 @@ class ModelRunner:
             if i != seq.num_blocks - 1:
                 end = start + self.block_size
             else:
-                end = start + seq.last_block_num_tokens
+                end = start + seq.last_block_num_tokens
             slot_mapping.extend(list(range(start, end)))
         if cu_seqlens_k[-1] > cu_seqlens_q[-1]:  # prefix cache
             block_tables = self.prepare_block_tables(seqs)
@@ -269,19 +284,28 @@ class ModelRunner:
         target_seqs = seqs
 
         # Fill pre-allocated CPU buffers
+        top_ks_is_zero = True
+        top_ps_is_one = True
+        repetition_penalties_is_one = True
         for i, seq in enumerate(target_seqs):
            self._cpu_temperatures[i] = seq.temperature
            self._cpu_cfg_scales[i] = seq.cfg_scale
            self._cpu_top_ks[i] = seq.top_k if seq.top_k is not None else 0
+            if seq.top_k is not None and seq.top_k > 0:
+                top_ks_is_zero = False
            self._cpu_top_ps[i] = seq.top_p if seq.top_p is not None else 1.0
+            if seq.top_p is not None and seq.top_p != 1.0:  # flag non-default values (the original diff inverted this check)
+                top_ps_is_one = False
            self._cpu_repetition_penalties[i] = seq.repetition_penalty if seq.repetition_penalty is not None else 1.0
+            if seq.repetition_penalty is not None and seq.repetition_penalty != 1.0:  # same fix as above
+                repetition_penalties_is_one = False
 
         # Transfer to GPU using sliced views (single batched transfer)
         temperatures = self._cpu_temperatures[:num_seqs].cuda(non_blocking=True)
         cfg_scales = self._cpu_cfg_scales[:num_seqs].cuda(non_blocking=True)
-        top_ks = self._cpu_top_ks[:num_seqs].cuda(non_blocking=True)
-        top_ps = self._cpu_top_ps[:num_seqs].cuda(non_blocking=True)
-        repetition_penalties = self._cpu_repetition_penalties[:num_seqs].cuda(non_blocking=True)
+        top_ks = self._cpu_top_ks[:num_seqs].cuda(non_blocking=True) if not top_ks_is_zero else None
+        top_ps = self._cpu_top_ps[:num_seqs].cuda(non_blocking=True) if not top_ps_is_one else None
+        repetition_penalties = self._cpu_repetition_penalties[:num_seqs].cuda(non_blocking=True) if not repetition_penalties_is_one else None
 
         return temperatures, cfg_scales, top_ks, top_ps, repetition_penalties
 
@@ -309,27 +333,15 @@ class ModelRunner:
         [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
         where uncond_seqi is the paired unconditional sequence of cond_seqi."""
         # Check if this is a CFG batch (contains paired conditional and unconditional sequences)
-        is_cfg_batch = False
-        if len(seqs) > 0:
-            # CFG batch if first sequence has cfg_scale > 1.0 and paired_seq
-            if seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None:
-                is_cfg_batch = True
-                # Verify batch structure: first half conditional, second half unconditional
-                num_cond = len(seqs) // 2
-                for i in range(num_cond):
-                    if seqs[i].is_unconditional or seqs[i + num_cond].is_unconditional == False:
-                        is_cfg_batch = False
-                        break
-
+        is_cfg_batch = seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None
         if is_cfg_batch:
             # CFG batch: seqs = [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
             num_cond = len(seqs) // 2
             cond_seqs = seqs[:num_cond]
-            uncond_seqs = seqs[num_cond:]
+            # uncond_seqs = seqs[num_cond:]
 
             # Prepare inputs for both conditional and unconditional (they're already in the batch)
-            input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
-                                    else self.prepare_decode(seqs))
+            input_ids, positions = (self.prepare_prefill(seqs) if is_prefill else self.prepare_decode(seqs))
             sample_params = self.prepare_sample(seqs, is_cfg_batch=True) if self.rank == 0 else None
             if sample_params is not None:
                 temperatures, cfg_scales, top_ks, top_ps, repetition_penalties = sample_params
@@ -380,7 +392,7 @@ class ModelRunner:
                 logits_cfg[i:i+1] = seq.logits_processor(seq_input_ids, logits_cfg[i:i+1])
 
             # Prepare input_ids for sampler (for repetition penalty, though we already applied it)
-            cond_input_ids = torch.tensor([seq.token_ids for seq in cond_seqs], device=logits_cfg.device)
+            # cond_input_ids = torch.tensor([seq.token_ids for seq in cond_seqs], device=logits_cfg.device)
 
             # Sample from CFG logits
             token_ids_cfg = self.sampler(
@@ -389,7 +401,7 @@ class ModelRunner:
                 top_ks=top_ks if top_ks is not None else None,
                 top_ps=top_ps if top_ps is not None else None,
                 repetition_penalties=None,  # Already applied above
-                input_ids=cond_input_ids,
+                # input_ids=cond_input_ids,
             ).tolist()
 
             # Update logits processor state after sampling
@@ -448,7 +460,7 @@ class ModelRunner:
                 logits[i] = processed[0]
 
             # Prepare input_ids for sampler
-            seq_input_ids = torch.tensor([seq.token_ids for seq in seqs], device=logits.device)
+            # seq_input_ids = torch.tensor([seq.token_ids for seq in seqs], device=logits.device)
 
             token_ids = self.sampler(
                 logits,
@@ -456,7 +468,7 @@ class ModelRunner:
                 top_ks=top_ks if top_ks is not None else None,
                 top_ps=top_ps if top_ps is not None else None,
                 repetition_penalties=None,  # Already applied above
-                input_ids=seq_input_ids,
+                # input_ids=seq_input_ids,
             ).tolist()
 
             # Update logits processor state after sampling
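
The buffer work above is the standard pinned-memory pattern in PyTorch: fill a page-locked CPU tensor in place, then issue a single asynchronous host-to-device copy of a sliced view instead of allocating a fresh tensor every step. A self-contained sketch of that pattern (illustrative only, not this repo's code):

    import torch

    MAX_BS = 64
    # pin_memory=True gives page-locked host memory, required for truly async copies
    cpu_buf = torch.zeros(MAX_BS, dtype=torch.float32, device="cpu", pin_memory=True)

    def batch_to_gpu(values):
        for i, v in enumerate(values):
            cpu_buf[i] = v                                         # in-place fill, no allocation
        return cpu_buf[:len(values)].cuda(non_blocking=True)       # one async H2D copy

Returning None when every sequence carries the default (top_k == 0, top_p == 1.0, repetition_penalty == 1.0) then lets the sampler skip both the filtering and the transfer for those tensors.
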
acestep/third_parts/nano-vllm/nanovllm/layers/sampler.py CHANGED
@@ -85,6 +85,7 @@ class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
 
+    @torch.compile
     def forward(
         self,
         logits: torch.Tensor,
@@ -102,27 +103,12 @@ class Sampler(nn.Module):
         """
         # Apply temperature
         logits = logits.float().div_(temperatures.unsqueeze(dim=1))
-
-        # Check conditions OUTSIDE compiled code to avoid graph breaks
-        # These .any() calls cause CPU-GPU sync, but we do it once here
-        # instead of inside the compiled function
-        need_topk = top_ks is not None and bool((top_ks > 0).any()) and bool((top_ks < logits.shape[1]).any())
-        need_topp = top_ps is not None and bool((top_ps < 1.0).any()) and bool((top_ps > 0.0).any())
-
-        if need_topk or need_topp:
-            # Apply filtering (this part is not compiled due to dynamic control flow)
-            logits = apply_top_k_top_p(
-                logits,
-                top_ks if need_topk else None,
-                top_ps if need_topp else None,
-            )
-
-        # Sample using compiled function
-        return self._sample(logits)
-
-    @torch.compile(dynamic=True)
-    def _sample(self, logits: torch.Tensor) -> torch.Tensor:
-        """Compiled sampling kernel - no graph breaks here."""
-        probs = logits.softmax(dim=-1, dtype=torch.float32)
-        q = torch.empty_like(probs).exponential_()
-        return probs.div(q).argmax(dim=-1)
+
+        logits = apply_top_k_top_p(
+            logits,
+            top_ks,
+            top_ps,
+        )
+        probs = torch.softmax(logits, dim=-1)
+        sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)).argmax(dim=-1)
+        return sample_tokens
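
The rewritten forward samples with the exponential-race (Gumbel-max) trick: dividing the probabilities by i.i.d. Exponential(1) noise and taking the argmax draws from the same categorical distribution as torch.multinomial, but it is branch-free, so the whole forward compiles as one graph. A quick self-check of the equivalence (illustrative, not repo code):

    import torch

    torch.manual_seed(0)
    probs = torch.tensor([0.1, 0.2, 0.7])
    counts = torch.zeros(3)
    for _ in range(20000):
        # Same sampling kernel as the diff: divide by Exp(1) noise, take argmax
        q = torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)
        counts[probs.div(q).argmax()] += 1
    print(counts / counts.sum())  # approaches tensor([0.1, 0.2, 0.7])

The clamp_min_ mirrors the diff's guard against a zero draw from exponential_, which would otherwise turn every class into inf at once.
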
profile_inference.py CHANGED
@@ -1,223 +1,682 @@
 #!/usr/bin/env python3
 """
-Profiling script for acestep/inference.py using cProfile
+Enhanced profiling script for ACE-Step inference with deep LLM analysis
+
+This script helps diagnose why LLM generation is slow by tracking:
+1. Total tokens generated vs expected throughput (200 tokens/sec baseline)
+2. Per-iteration timing to detect compilation overhead or slow operations
+3. Constrained decoding overhead
+4. CFG overhead (2x forward passes)
+5. Model forward time vs sampling/processing time
 
 Usage:
-    python profile_inference.py
-    python profile_inference.py --warmup
+    python profile_inference.py                              # Standard profiling with warmup
+    python profile_inference.py --no-warmup                  # Profile first run (includes compilation)
+    python profile_inference.py --llm-debug                  # Deep LLM performance debugging
+    python profile_inference.py --detailed                   # Add cProfile function-level analysis
+
+Inference mode options:
+    python profile_inference.py --thinking                   # Enable CoT for code generation
+    python profile_inference.py --use-constrained-decoding   # Use FSM constrained decoding
+    python profile_inference.py --use-cot-metas              # Enable LM to generate metadata via CoT
 """
 
-import cProfile
-import pstats
-import io
 import time
 import argparse
 import sys
 import os
+from contextlib import contextmanager
+from collections import defaultdict
+import json
+from typing import Tuple, Dict, Any, List
+from functools import wraps
 
 # Add project root to path
 project_root = os.path.abspath(os.path.dirname(__file__))
 if project_root not in sys.path:
     sys.path.insert(0, project_root)
 
+import torch
 from acestep.inference import generate_music, GenerationParams, GenerationConfig
 from acestep.handler import AceStepHandler
 from acestep.llm_inference import LLMHandler
-import json
-from typing import Tuple
-
-
-def profile_with_cprofile(dit_handler, llm_handler, params, config, warmup=False):
-    """Profile using Python's built-in cProfile.
-
-    Args:
-        warmup: If True, run once for warmup before profiling (default: False)
-    """
-    print("=" * 80)
-    print("Profiling with cProfile")
-    print("=" * 80)
-
-    # Warmup run (to exclude PyTorch compilation overhead)
-    if warmup:
-        print("\n[Warmup] Running first generation to warm up (PyTorch compilation, etc.)...")
-        warmup_start = time.time()
-        params.use_cot_metas = False
-        config.is_format_caption = True
-        config.use_constrained_decoding = False
-        warmup_result = generate_music(dit_handler, llm_handler, params, config, save_dir="./")
-        warmup_time = time.time() - warmup_start
-        print(f"[Warmup] Completed in {warmup_time:.2f}s")
-        if not warmup_result.success:
-            print(f"[Warmup] ⚠ Warmup generation failed: {warmup_result.error}")
-            return warmup_result
-
-    # Actual profiling run (first inference)
-    print("\n[Profiling] Running first generation for profiling...")
-    profiler = cProfile.Profile()
-    profiler.enable()
-
-    profiling_start = time.time()
-    try:
-        result = generate_music(dit_handler, llm_handler, params, config, save_dir="./")
-    finally:
-        profiler.disable()
-        profiling_time = time.time() - profiling_start
-
-    # Create stats
-    s = io.StringIO()
-    ps = pstats.Stats(profiler, stream=s)
-    ps.sort_stats('cumulative')
-
-    print(f"\n[Profiling] Completed in {profiling_time:.2f}s")
-    print("\nTop 30 functions by cumulative time:")
-    print("-" * 80)
-    ps.print_stats(30)
-
-    print("\nTop 30 functions by total time:")
-    print("-" * 80)
-    ps.sort_stats('tottime')
-    ps.print_stats(30)
-
-    # Save detailed report to file
-    output_file = "profile_cprofile.txt"
-    with open(output_file, 'w') as f:
-        # Create a new Stats object with file as stream
-        ps_file = pstats.Stats(profiler, stream=f)
-        ps_file.sort_stats('cumulative')
-        ps_file.print_stats()
-    print(f"\nDetailed profile saved to: {output_file}")
-
-    return result
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Profile acestep/inference.py")
-    parser.add_argument(
-        "--checkpoint-dir",
-        type=str,
-        default="./checkpoints",
-        help="Path to checkpoints directory"
-    )
-    parser.add_argument(
-        "--config-path",
-        type=str,
-        default="acestep-v15-turbo-rl",
-        help="Model config path"
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="cuda",
-        help="Device to use (cuda/cpu)"
+
+
+class PreciseTimer:
+    """High-precision timer with CUDA synchronization for accurate GPU timing"""
+
+    def __init__(self, device="cuda"):
+        self.device = device
+        self.timings = defaultdict(list)
+        self.enabled = True
+
+    def sync(self):
+        """Synchronize CUDA operations for accurate timing"""
+        if self.enabled and self.device.startswith("cuda") and torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+    @contextmanager
+    def time(self, name: str):
+        """Time a code section with CUDA synchronization"""
+        if not self.enabled:
+            yield
+            return
+
+        self.sync()
+        start = time.perf_counter()
+        try:
+            yield
+        finally:
+            self.sync()
+            elapsed = time.perf_counter() - start
+            self.timings[name].append(elapsed)
+
+    def get_total(self, name: str) -> float:
+        """Get total accumulated time for a section"""
+        return sum(self.timings.get(name, []))
+
+    def get_mean(self, name: str) -> float:
+        """Get mean time per call for a section"""
+        times = self.timings.get(name, [])
+        return sum(times) / len(times) if times else 0.0
+
+    def get_count(self, name: str) -> int:
+        """Get number of calls for a section"""
+        return len(self.timings.get(name, []))
+
+    def get_all(self, name: str) -> List[float]:
+        """Get all timing samples for a section"""
+        return self.timings.get(name, [])
+
+
+class LLMDebugger:
+    """Track detailed LLM performance metrics to diagnose slow generation"""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Reset all metrics"""
+        self.total_tokens = 0
+        self.generation_start = None
+        self.generation_end = None
+        self.output_text = ""
+        self.prompt_length = 0
+
+    def start(self, prompt_length: int = 0):
+        """Mark generation start"""
+        self.generation_start = time.perf_counter()
+        self.prompt_length = prompt_length
+
+    def end(self, output_text: str = ""):
+        """Mark generation end and store output"""
+        self.generation_end = time.perf_counter()
+        self.output_text = output_text
+
+    def set_token_count(self, count: int):
+        """Set total token count"""
+        self.total_tokens = count
+
+    def get_throughput(self) -> float:
+        """Calculate actual tokens per second"""
+        if self.generation_start and self.generation_end and self.total_tokens > 0:
+            total_time = self.generation_end - self.generation_start
+            if total_time > 0:
+                return self.total_tokens / total_time
+        return 0.0
+
+    def print_analysis(self):
+        """Print detailed LLM performance analysis"""
+        if not self.generation_start or not self.generation_end:
+            return
+
+        print("\n" + "=" * 100)
+        print("šŸ” LLM PERFORMANCE DEEP DIVE")
+        print("=" * 100)
+
+        total_time = self.generation_end - self.generation_start
+        throughput = self.get_throughput()
+
+        # Basic metrics table
+        print(f"\n{'Metric':<40} {'Value':<20} {'Notes'}")
+        print("-" * 100)
+        print(f"{'Total Tokens Generated:':<40} {self.total_tokens:<20} (new tokens only)")
+        print(f"{'Prompt Length (estimate):':<40} {self.prompt_length:<20} (input tokens)")
+        print(f"{'Total Generation Time:':<40} {total_time:<20.3f} seconds")
+        print(f"{'Measured Throughput:':<40} {throughput:<20.1f} tokens/sec")
+        print(f"{'Expected Throughput:':<40} {'200':<20} tokens/sec (baseline)")
+
+        # Calculate performance gap
+        if throughput > 0:
+            slowdown = 200.0 / throughput
+            efficiency = (throughput / 200.0) * 100
+            print(f"{'Performance vs Baseline:':<40} {efficiency:<20.1f}% of expected")
+            print(f"{'Slowdown Factor:':<40} {slowdown:<20.2f}x slower")
+
+        # Analyze generated output
+        if self.output_text:
+            print(f"\n{'Output Analysis:':<40}")
+            print(f"{'  Output length:':<40} {len(self.output_text):<20} characters")
+
+            # Count audio codes
+            import re
+            code_pattern = r'<\|audio_code_\d+\|>'
+            codes = re.findall(code_pattern, self.output_text)
+            if codes:
+                print(f"{'  Audio codes generated:':<40} {len(codes):<20} codes")
+                print(f"{'  Expected audio duration:':<40} {f'~{len(codes)/5:.1f}s':<20} (5 codes per second)")
+                if total_time > 0:
+                    print(f"{'  Time per audio code:':<40} {f'{total_time/len(codes)*1000:.1f}ms':<20}")
+
+            # Check for CoT section
+            if '<think>' in self.output_text and '</think>' in self.output_text:
+                cot_start = self.output_text.find('<think>')
+                cot_end = self.output_text.find('</think>') + 8
+                cot_section = self.output_text[cot_start:cot_end]
+                cot_token_est = len(cot_section) // 4
+                print(f"{'  CoT section tokens (estimate):':<40} {f'~{cot_token_est}':<20}")
+
+        # Diagnostic guidance
+        print("\n" + "=" * 100)
+        print("šŸ”§ DIAGNOSTIC GUIDANCE")
+        print("=" * 100)
+
+        if throughput < 50:
+            print("\nāš ļø CRITICAL: Throughput is extremely low (<50 tokens/sec)")
+            print("\nThis is ~4x slower than expected. Likely causes:")
+            print("  1. ā— Constrained decoding FSM overhead")
+            print("     → Each token triggers FSM state machine validation")
+            print("     → Try: set use_constrained_decoding=False in config")
+            print("  2. ā— CFG with double forward passes")
+            print("     → cfg_scale > 1.0 means running model twice per token")
+            print("     → Check: params.lm_cfg_scale value")
+            print("  3. ā— Running in eager mode without compilation")
+            print("     → PyTorch should compile kernels after warmup")
+            print("     → Check: torch._dynamo.config settings")
+
+        elif throughput < 100:
+            print("\nāš ļø WARNING: Throughput is low (50-100 tokens/sec)")
+            print("\nLikely causes:")
+            print("  1. Constrained decoding overhead (~30-50% slowdown expected)")
+            print("  2. CFG enabled (2x compute per token if cfg_scale > 1.0)")
+            print("  3. Small model or inefficient GPU utilization")
+
+        elif throughput < 150:
+            print("\nāš ļø Throughput is below baseline but acceptable (100-150 tokens/sec)")
+            print("\nMinor overhead from:")
+            print("  - Constrained decoding: ~20-30% overhead")
+            print("  - Profiling instrumentation: ~5-10% overhead")
+
+        else:
+            print(f"\nāœ“ Throughput is good ({throughput:.1f} tokens/sec)")
+            print("  Performance is within acceptable range")
+
+
+# Global instances
+timer = None
+llm_debugger = None
+
+
+def wrap_method_with_timing(obj, method_name: str, timing_key: str):
+    """Wrap a method with timing instrumentation"""
+    original_method = getattr(obj, method_name)
+
+    @wraps(original_method)
+    def timed_wrapper(*args, **kwargs):
+        with timer.time(timing_key):
+            return original_method(*args, **kwargs)
+
+    setattr(obj, method_name, timed_wrapper)
+    return original_method
+
+
+def wrap_llm_with_debug_tracking(llm_handler):
+    """Wrap LLM generation with detailed performance tracking"""
+    original_method = llm_handler.generate_with_stop_condition
+
+    @wraps(original_method)
+    def debug_wrapper(*args, **kwargs):
+        # Estimate prompt length
+        caption = kwargs.get('caption', args[0] if len(args) > 0 else "")
+        lyrics = kwargs.get('lyrics', args[1] if len(args) > 1 else "")
+        prompt_estimate = len(caption) + len(lyrics)
+        prompt_tokens_estimate = prompt_estimate // 4
+
+        # Start tracking
+        llm_debugger.reset()
+        llm_debugger.start(prompt_length=prompt_tokens_estimate)
+
+        # Call original with timing
+        with timer.time('llm_inference'):
+            result = original_method(*args, **kwargs)
+
+        # Extract and analyze output
+        output_text = ""
+        if isinstance(result, tuple) and len(result) >= 2:
+            if isinstance(result[1], list):
+                # Batch mode
+                output_text = "".join(result[1])
+            else:
+                # Single mode
+                cot_output = ""
+                if isinstance(result[0], dict):
+                    for v in result[0].values():
+                        if isinstance(v, str):
+                            cot_output += v
+                output_text = cot_output + str(result[1])
+
+        # Count tokens
+        import re
+        code_pattern = r'<\|audio_code_\d+\|>'
+        codes = re.findall(code_pattern, output_text)
+        remaining_text = re.sub(code_pattern, '', output_text)
+        cot_tokens_estimate = len(remaining_text) // 4
+        total_tokens = len(codes) + cot_tokens_estimate
+
+        llm_debugger.set_token_count(total_tokens)
+        llm_debugger.end(output_text)
+
+        return result
+
+    llm_handler.generate_with_stop_condition = debug_wrapper
+    return original_method
+
+
+def instrument_handlers(dit_handler, llm_handler, enable_llm_debug=False):
+    """Add timing instrumentation to handler methods"""
+    originals = {}
+
+    # Instrument LLM
+    if llm_handler and llm_handler.llm_initialized:
+        if enable_llm_debug:
+            originals['llm_generate'] = wrap_llm_with_debug_tracking(llm_handler)
+        else:
+            originals['llm_generate'] = wrap_method_with_timing(
+                llm_handler, 'generate_with_stop_condition', 'llm_inference'
+            )
+
+    # Instrument DiT handler
+    originals['dit_prepare'] = wrap_method_with_timing(
+        dit_handler, 'prepare_batch_data', 'prepare_batch_data'
     )
-    parser.add_argument(
-        "--lm-model",
-        type=str,
-        default="acestep-5Hz-lm-0.6B-v3",
-        help="LM model path"
+    originals['dit_generate'] = wrap_method_with_timing(
+        dit_handler, 'service_generate', 'dit_inference'
    )
-    parser.add_argument(
-        "--lm-backend",
-        type=str,
-        default="vllm",
-        help="LM backend"
+    originals['dit_decode'] = wrap_method_with_timing(
+        dit_handler, 'tiled_decode', 'vae_decode'
    )
-    parser.add_argument(
-        "--warmup",
-        action="store_true",
-        help="Enable warmup run before profiling (default: False, profile first run)"
+
+    return originals
+
+
+def restore_handlers(dit_handler, llm_handler, originals):
+    """Restore original handler methods after profiling"""
+    if llm_handler and 'llm_generate' in originals:
+        llm_handler.generate_with_stop_condition = originals['llm_generate']
+
+    dit_handler.prepare_batch_data = originals['dit_prepare']
+    dit_handler.service_generate = originals['dit_generate']
+    dit_handler.tiled_decode = originals['dit_decode']
+
+
+def print_profiling_results(total_time: float, show_llm_debug: bool = False):
+    """Print comprehensive profiling results with performance insights"""
+    print("\n" + "=" * 100)
+    print("šŸŽÆ PROFILING RESULTS")
+    print("=" * 100)
+
+    # Define timing categories
+    model_sections = {
+        'llm_inference': 'LLM Inference (5Hz Language Model)',
+        'dit_inference': 'DiT Inference (Diffusion Transformer)',
+        'vae_decode': 'VAE Decode (Audio Decoder)',
+    }
+
+    non_model_sections = {
+        'prepare_batch_data': 'Prepare Batch Data (embedding, formatting)',
+    }
+
+    # Calculate totals
+    model_time = sum(timer.get_total(k) for k in model_sections.keys())
+    non_model_time = sum(timer.get_total(k) for k in non_model_sections.keys())
+    other_time = total_time - model_time - non_model_time
+
+    # Print summary table
+    print(f"\n{'CATEGORY':<50} {'TIME (s)':<12} {'%':<8} {'CALLS':<8}")
+    print("-" * 100)
+
+    # Model time breakdown
+    print(f"\n{'šŸ¤– MODEL TIME (Total)':<50} {model_time:<12.3f} {100*model_time/total_time:>6.1f}% {'':<8}")
+    for key, desc in model_sections.items():
+        t = timer.get_total(key)
+        c = timer.get_count(key)
+        if c > 0:
+            mean = timer.get_mean(key)
+            pct = 100 * t / total_time
+            print(f"  {'ā”œā”€ ' + desc:<48} {t:<12.3f} {pct:>6.1f}% {c:<8} (avg: {mean:.3f}s)")
+
+    # Non-model time breakdown
+    print(f"\n{'āš™ļø NON-MODEL TIME (Total)':<50} {non_model_time:<12.3f} {100*non_model_time/total_time:>6.1f}% {'':<8}")
+    for key, desc in non_model_sections.items():
+        t = timer.get_total(key)
+        c = timer.get_count(key)
+        if c > 0:
+            mean = timer.get_mean(key)
+            pct = 100 * t / total_time
+            print(f"  {'ā”œā”€ ' + desc:<48} {t:<12.3f} {pct:>6.1f}% {c:<8} (avg: {mean:.3f}s)")
+
+    # Other time
+    if other_time > 0.01:
+        pct = 100 * other_time / total_time
+        print(f"\n{'šŸ“¦ OTHER TIME (I/O, overhead, audio save)':<50} {other_time:<12.3f} {pct:>6.1f}% {'':<8}")
+
+    print(f"\n{'šŸ“Š TOTAL TIME':<50} {total_time:<12.3f} {'100.0%':>6} {'':<8}")
+
+    # Show LLM detailed analysis if enabled
+    if show_llm_debug:
+        llm_debugger.print_analysis()
+
+    # Performance insights
+    print("\n" + "=" * 100)
+    print("šŸ’” PERFORMANCE INSIGHTS")
+    print("=" * 100)
+
+    llm_t = timer.get_total('llm_inference')
+    dit_t = timer.get_total('dit_inference')
+    vae_t = timer.get_total('vae_decode')
+    prep_t = timer.get_total('prepare_batch_data')
+
+    # Model time insights
+    if model_time > 0:
+        print(f"\nāœ“ Model operations: {model_time:.3f}s ({100*model_time/total_time:.1f}% of total)")
+
+        if llm_t > 0:
+            print(f"  - LLM: {llm_t:.3f}s ({100*llm_t/model_time:.1f}% of model time)")
+        if dit_t > 0:
+            print(f"  - DiT: {dit_t:.3f}s ({100*dit_t/model_time:.1f}% of model time)")
+        if vae_t > 0:
+            print(f"  - VAE: {vae_t:.3f}s ({100*vae_t/model_time:.1f}% of model time)")
+
+    # LLM bottleneck analysis
+    if llm_t > dit_t and llm_t > 5.0:
+        print(f"\nāš ļø LLM IS THE BOTTLENECK: {llm_t:.3f}s ({100*llm_t/total_time:.1f}% of total)")
+        print(f"\n  Possible causes:")
+        print(f"  1. Generating too many tokens → use --llm-debug to verify")
+        print(f"  2. Constrained decoding overhead → FSM validation per token")
+        print(f"  3. CFG overhead → cfg_scale > 1.0 = 2x forward passes")
+        print(f"  4. First-token latency → warmup should help")
+        print(f"  5. KV cache inefficiency → should be ~5-10ms/token")
+
+    # Non-model insights
+    if non_model_time / total_time > 0.1:
+        print(f"\nāš ļø Non-model operations: {non_model_time:.3f}s ({100*non_model_time/total_time:.1f}%)")
+        if prep_t > 0.1:
+            print(f"  - Batch preparation: {prep_t:.3f}s")
+
+    # I/O overhead
+    if other_time / total_time > 0.2:
+        print(f"\nāš ļø Overhead/I/O: {other_time:.3f}s ({100*other_time/total_time:.1f}%)")
+
+    # Recommendations
+    print("\n" + "=" * 100)
+    print("šŸš€ OPTIMIZATION RECOMMENDATIONS")
+    print("=" * 100)
+
+    if llm_t > dit_t * 2:
+        print("\nšŸŽÆ Priority: Optimize LLM")
+        print("  1. Run: python profile_inference.py --llm-debug")
+        print("     → Shows exact token count and throughput")
+        print("  2. Check constrained decoding overhead")
+        print("  3. Check CFG scaling (lm_cfg_scale parameter)")
+        print("  4. Profile nanovllm engine step() timing")
+        print("  5. Compare vllm vs transformers backends")
+
+
+def run_profiled_generation(dit_handler, llm_handler, params, config,
+                            enable_cprofile=False, enable_llm_debug=False):
+    """Execute generation with full profiling instrumentation"""
+    # Instrument handlers
+    originals = instrument_handlers(dit_handler, llm_handler, enable_llm_debug)
+
+    try:
+        print("\n[Profiling] Starting generation...")
+        timer.sync()
+        total_start = time.perf_counter()
+
+        # Optional cProfile
+        prof = None
+        if enable_cprofile:
+            import cProfile
+            prof = cProfile.Profile()
+            prof.enable()
+
+        # Run generation
+        result = generate_music(dit_handler, llm_handler, params, config, save_dir="./")
+
+        # Stop timing
+        timer.sync()
+        total_time = time.perf_counter() - total_start
+
+        # Save cProfile if enabled
+        if enable_cprofile and prof:
+            prof.disable()
+
+            import pstats
+            import io
+
+            output_file = "profile_cprofile_detailed.txt"
+            with open(output_file, 'w') as f:
+                ps = pstats.Stats(prof, stream=f)
+                ps.sort_stats('cumulative')
+                ps.print_stats(100)
+
+            # Print top functions
+            print("\n" + "=" * 100)
+            print("šŸ“Š TOP 20 FUNCTIONS BY CUMULATIVE TIME (cProfile)")
+            print("=" * 100)
+            s = io.StringIO()
+            ps = pstats.Stats(prof, stream=s)
+            ps.sort_stats('cumulative')
+            ps.print_stats(20)
+            print(s.getvalue())
+
+            print(f"\nFull report: {output_file}")
+
+        # Print results
+        print_profiling_results(total_time, show_llm_debug=enable_llm_debug)
+
+        return result, total_time
+
+    finally:
+        restore_handlers(dit_handler, llm_handler, originals)
+
+
+def load_example_config(example_file: str) -> Tuple[GenerationParams, GenerationConfig]:
+    """Load configuration from example JSON file"""
+    try:
+        with open(example_file, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        params = GenerationParams(
+            caption=data.get('caption', ''),
+            lyrics=data.get('lyrics', ''),
+            bpm=data.get('bpm'),
+            keyscale=data.get('keyscale', ''),
+            timesignature=data.get('timesignature', ''),
+            vocal_language=data.get('language', 'unknown'),
+            duration=data.get('duration'),
+            thinking=data.get('think', False),
+            inference_steps=data.get('inference_steps', 8),
+            seed=data.get('seed', 42),
+        )
+
+        config = GenerationConfig(batch_size=data.get('batch_size', 1), seeds=[42])
+
+        return params, config
+
+    except Exception as e:
+        print(f"  āŒ Failed to load: {e}")
+        return None, None
+
+
+def main():
+    global timer, llm_debugger
+
+    parser = argparse.ArgumentParser(
+        description="Profile ACE-Step inference with LLM debugging"
    )
+    parser.add_argument("--checkpoint-dir", type=str, default="./checkpoints")
+    parser.add_argument("--config-path", type=str, default="acestep-v15-turbo-rl")
+    parser.add_argument("--device", type=str, default="cuda")
+    parser.add_argument("--lm-model", type=str, default="acestep-5Hz-lm-0.6B-v3")
+    parser.add_argument("--lm-backend", type=str, default="vllm")
+    parser.add_argument("--no-warmup", action="store_true")
+    parser.add_argument("--detailed", action="store_true")
+    parser.add_argument("--llm-debug", action="store_true",
+                        help="Enable deep LLM debugging (token count, throughput)")
+    parser.add_argument("--example", type=str, default="example_05.json")
+
+    # Inference mode parameters
+    parser.add_argument("--thinking", action="store_true",
+                        help="Enable CoT reasoning for LM to generate audio codes")
+    parser.add_argument("--use-constrained-decoding", action="store_true",
+                        help="Use FSM-based constrained decoding for meta generation")
+    parser.add_argument("--use-cot-metas", action="store_true",
+                        help="Enable LLM to generate music metadata via CoT reasoning")
 
     args = parser.parse_args()
 
-    # Initialize handlers
-    print("Initializing handlers...")
+    # Initialize
+    timer = PreciseTimer(device=args.device)
+    llm_debugger = LLMDebugger()
+
+    print("=" * 100)
+    print("šŸŽµ ACE-Step Inference Profiler (LLM Performance Analysis)")
+    print("=" * 100)
+    print(f"\nConfiguration:")
+    print(f"  Device: {args.device}")
+    print(f"  LLM Backend: {args.lm_backend}")
+    print(f"  LLM Debug: {'Enabled' if args.llm_debug else 'Disabled'}")
+    print(f"  Warmup: {'Disabled' if args.no_warmup else 'Enabled'}")
+    print(f"\nInference Mode:")
+    print(f"  Thinking (CoT): {'Enabled' if args.thinking else 'Disabled'}")
+    print(f"  Constrained Decoding: {'Enabled' if args.use_constrained_decoding else 'Disabled'}")
+    print(f"  Use CoT for Metas: {'Enabled' if args.use_cot_metas else 'Disabled'}")
+
+    # Initialize models
+    print(f"\nInitializing models...")
+
     dit_handler = AceStepHandler()
     llm_handler = LLMHandler()
 
-    # Initialize DiT
-    print("  - Initializing DiT model...")
+    print("  šŸŽ¹ Initializing DiT...")
     status_dit, success_dit = dit_handler.initialize_service(
         project_root=project_root,
         config_path=args.config_path,
         device=args.device,
+        use_flash_attention=True,
     )
     if not success_dit:
-        print(f"  āŒ DiT initialization failed: {status_dit}")
+        print(f"  āŒ Failed: {status_dit}")
         sys.exit(1)
-    print("  āœ“ DiT model initialized")
-
-    # Initialize LLM
-    print("  - Initializing LLM model...")
-    status_llm, success_llm = llm_handler.initialize(
-        checkpoint_dir=args.checkpoint_dir,
-        lm_model_path=args.lm_model,
-        backend=args.lm_backend,
-        device=args.device,
-    )
-    if success_llm:
-        print("  āœ“ LM model initialized")
-    else:
-        print(f"  ⚠ LM initialization failed: {status_llm}")
+    print(f"  āœ“ DiT ready")
+
+    print("  🧠 Initializing LLM...")
+    if args.thinking or args.use_cot_metas:
+        status_llm, success_llm = llm_handler.initialize(
+            checkpoint_dir=args.checkpoint_dir,
+            lm_model_path=args.lm_model,
+            backend=args.lm_backend,
+            device=args.device,
+        )
+        if success_llm:
+            print(f"  āœ“ LLM ready ({args.lm_backend})")
+        else:
+            print(f"  ⚠ Failed: {status_llm}")
+    else:
+        print(f"  āœ“ LLM not initialized (thinking or use_cot_metas is disabled)")
 
-    # Load test parameters from example file (same as acestep/inference.py)
-    def load_example_config(example_file: str) -> Tuple[GenerationParams, GenerationConfig]:
-        """Load configuration from an example JSON file."""
-        try:
-            with open(example_file, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-
-            # Convert example format to GenerationParams and GenerationConfig
-            # Handle time signature format (example uses "4" instead of "4/4")
-            time_sig = data.get('timesignature', '')
-
-            params = GenerationParams(
-                caption=data.get('caption', ''),
-                lyrics=data.get('lyrics', ''),
-                bpm=data.get('bpm'),
-                keyscale=data.get('keyscale', ''),
-                timesignature=time_sig,
-                vocal_language=data.get('language', 'unknown'),
-                duration=data.get('duration'),
-                thinking=data.get('think', False),
-                inference_steps=data.get('inference_steps', 8),
-                seed=42,
-            )
-
-            config = GenerationConfig()
-            config.batch_size = data.get('batch_size', 1)
-
-            return params, config
-
-        except Exception as e:
-            print(f"  ⚠ Failed to load example file: {e}")
-            return None, None
-
-    # Load production example (same as acestep/inference.py)
-    example_file = os.path.join(project_root, "examples", "text2music", "example_05.json")
+    # Load example
+    example_file = os.path.join(project_root, "examples", "text2music", args.example)
     if not os.path.exists(example_file):
-        print(f"\n  āŒ Example file not found: {example_file}")
-        print("  Please ensure the examples directory exists.")
+        print(f"\nāŒ Not found: {example_file}")
         sys.exit(1)
 
-    print(f"\n  Loading example: {os.path.basename(example_file)}")
+    print(f"\nšŸ“„ Loading: {args.example}")
     params, config = load_example_config(example_file)
 
     if not params or not config:
-        print("  āŒ Failed to load example configuration")
+        print("āŒ Failed to load config")
         sys.exit(1)
 
-    print("\n" + "=" * 80)
-    print("Starting profiling...")
-    print("=" * 80)
+    print(f"  Caption: {params.caption[:60]}...")
+    print(f"  Batch: {config.batch_size}, Steps: {params.inference_steps}, LLM: {params.thinking}")
+
+    # Warmup
+    if not args.no_warmup:
+        print("\n" + "=" * 100)
+        print("šŸ”„ WARMUP RUN")
+        print("=" * 100)
+
+        warmup_params = GenerationParams(
+            caption=params.caption,
+            lyrics=params.lyrics,
+            bpm=params.bpm,
+            keyscale=params.keyscale,
+            timesignature=params.timesignature,
+            vocal_language=params.vocal_language,
+            duration=params.duration,
+            thinking=args.thinking,
+            use_cot_metas=args.use_cot_metas,
+            inference_steps=params.inference_steps,
+            seed=params.seed,
+        )
+        warmup_config = GenerationConfig(batch_size=1, seeds=[42])
+        warmup_config.use_constrained_decoding = args.use_constrained_decoding
+
+        warmup_start = time.perf_counter()
+        warmup_result = generate_music(dit_handler, llm_handler, warmup_params, warmup_config, save_dir="./")
+        warmup_time = time.perf_counter() - warmup_start
+
+        print(f"\nāœ“ Warmup: {warmup_time:.2f}s")
+        if not warmup_result.success:
+            print(f"āš ļø Warning: {warmup_result.error}")
+
+        # Reset
+        timer = PreciseTimer(device=args.device)
+        llm_debugger = LLMDebugger()
+
+    # Profiling run
+    print("\n" + "=" * 100)
+    print("ā±ļø PROFILING RUN")
+    print("=" * 100)
 
-    result = profile_with_cprofile(dit_handler, llm_handler, params, config, warmup=args.warmup)
+    # Apply inference mode settings
+    config.use_constrained_decoding = args.use_constrained_decoding
+    # Override thinking and use_cot_metas parameters if specified via CLI
+    if args.thinking:
+        params.thinking = True
+    if args.use_cot_metas:
+        params.use_cot_metas = True
 
-    if result and not result.success:
-        print(f"\n⚠ Generation failed: {result.error}")
+    result, total_time = run_profiled_generation(
+        dit_handler, llm_handler, params, config,
+        enable_cprofile=args.detailed,
+        enable_llm_debug=args.llm_debug
+    )
+
+    if not result.success:
+        print(f"\nāŒ Failed: {result.error}")
+        sys.exit(1)
+
+    print(f"\nāœ… Success! Generated {len(result.audios)} audio file(s)")
+
+    # Final tips
+    if args.detailed:
+        print("\nšŸ’” Check profile_cprofile_detailed.txt for function-level analysis")
+    elif not args.llm_debug:
+        print("\nšŸ’” Run with --llm-debug to see LLM token count and throughput analysis")
 
 
 if __name__ == "__main__":
     main()
-
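
The instrumentation in the new script is plain method monkey-patching: swap a bound method for a @wraps-preserving wrapper that times the call, and restore the original in a finally block so the handlers leave profiling unchanged. The same pattern in isolation (the Worker class here is made up for the sketch):

    import time
    from functools import wraps

    class Worker:                      # stand-in for a handler
        def step(self):
            time.sleep(0.01)

    def wrap_with_timing(obj, method_name, samples):
        original = getattr(obj, method_name)

        @wraps(original)               # keep the original name/docstring
        def timed(*args, **kwargs):
            start = time.perf_counter()
            try:
                return original(*args, **kwargs)
            finally:
                samples.append(time.perf_counter() - start)

        setattr(obj, method_name, timed)
        return original                # caller keeps this to restore later

    worker, samples = Worker(), []
    original = wrap_with_timing(worker, "step", samples)
    try:
        worker.step()
    finally:
        setattr(worker, "step", original)   # mirrors restore_handlers()
    print(f"step took {samples[0] * 1000:.1f} ms")
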