Gong Junmin committed on
Commit 77c327b · unverified · 2 parent(s): 24f370e ba6c5ba

Merge pull request #1 from ace-step/refact_add_inference

acestep/api_server.py CHANGED
@@ -44,6 +44,12 @@ from acestep.constants import (
44
  DEFAULT_DIT_INSTRUCTION,
45
  DEFAULT_LM_INSTRUCTION,
46
  )
47
 
48
 
49
  JobStatus = Literal["queued", "running", "succeeded", "failed"]
@@ -387,6 +393,10 @@ def create_app() -> FastAPI:
387
  app.state.executor = executor
388
  app.state.job_store = store
389
  app.state._python_executable = sys.executable
390
 
391
  async def _ensure_initialized() -> None:
392
  h: AceStepHandler = app.state.handler
@@ -443,131 +453,10 @@ def create_app() -> FastAPI:
443
  job_store.mark_running(job_id)
444
 
445
  def _blocking_generate() -> Dict[str, Any]:
446
- def _normalize_optional_int(v: Any) -> Optional[int]:
447
- if v is None:
448
- return None
449
- try:
450
- iv = int(v)
451
- except Exception:
452
- return None
453
- return None if iv == 0 else iv
454
-
455
- def _normalize_optional_float(v: Any) -> Optional[float]:
456
- if v is None:
457
- return None
458
- try:
459
- fv = float(v)
460
- except Exception:
461
- return None
462
- # gradio treats 1.0 as disabled for top_p
463
- return None if fv >= 1.0 else fv
464
-
465
- def _maybe_fill_from_metadata(current: GenerateMusicRequest, meta: Dict[str, Any]) -> tuple[Optional[int], str, str, Optional[float]]:
466
- def _parse_first_float(v: Any) -> Optional[float]:
467
- if v is None:
468
- return None
469
- if isinstance(v, (int, float)):
470
- return float(v)
471
- s = str(v).strip()
472
- if not s or s.upper() == "N/A":
473
- return None
474
- try:
475
- return float(s)
476
- except Exception:
477
- pass
478
- m = re.search(r"[-+]?\d*\.?\d+", s)
479
- if not m:
480
- return None
481
- try:
482
- return float(m.group(0))
483
- except Exception:
484
- return None
485
-
486
- def _parse_first_int(v: Any) -> Optional[int]:
487
- fv = _parse_first_float(v)
488
- if fv is None:
489
- return None
490
- try:
491
- return int(round(fv))
492
- except Exception:
493
- return None
494
-
495
- # Fill only when user did not provide values
496
- bpm_val = current.bpm
497
- if bpm_val is None:
498
- m = meta.get("bpm")
499
- parsed = _parse_first_int(m)
500
- if parsed is not None and parsed > 0:
501
- bpm_val = parsed
502
-
503
- key_scale_val = current.key_scale
504
- if not key_scale_val:
505
- m = meta.get("keyscale", meta.get("key_scale", ""))
506
- if m not in (None, "", "N/A"):
507
- key_scale_val = str(m)
508
-
509
- time_sig_val = current.time_signature
510
- if not time_sig_val:
511
- m = meta.get("timesignature", meta.get("time_signature", ""))
512
- if m not in (None, "", "N/A"):
513
- time_sig_val = str(m)
514
-
515
- dur_val = current.audio_duration
516
- if dur_val is None:
517
- m = meta.get("duration", meta.get("audio_duration"))
518
- parsed = _parse_first_float(m)
519
- if parsed is not None:
520
- dur_val = float(parsed)
521
- if dur_val <= 0:
522
- dur_val = None
523
-
524
- # Avoid truncating lyrical songs when LM predicts a very short duration.
525
- # (Users can still force a short duration by explicitly setting `audio_duration`.)
526
- if dur_val is not None and (current.lyrics or "").strip():
527
- min_dur = float(os.getenv("ACESTEP_LM_MIN_DURATION_SECONDS", "30"))
528
- if dur_val < min_dur:
529
- dur_val = None
530
-
531
- return bpm_val, key_scale_val, time_sig_val, dur_val
532
-
533
- def _estimate_duration_from_lyrics(lyrics: str) -> Optional[float]:
534
- lyrics = (lyrics or "").strip()
535
- if not lyrics:
536
- return None
537
-
538
- # Best-effort heuristic: singing rate ~ 2.2 words/sec for English-like lyrics.
539
- # For languages without spaces, fall back to non-space char count.
540
- words = re.findall(r"[A-Za-z0-9']+", lyrics)
541
- if len(words) >= 8:
542
- words_per_sec = float(os.getenv("ACESTEP_LYRICS_WORDS_PER_SEC", "2.2"))
543
- est = len(words) / max(0.5, words_per_sec)
544
- else:
545
- non_space = len(re.sub(r"\s+", "", lyrics))
546
- chars_per_sec = float(os.getenv("ACESTEP_LYRICS_CHARS_PER_SEC", "12"))
547
- est = non_space / max(4.0, chars_per_sec)
548
-
549
- min_dur = float(os.getenv("ACESTEP_LYRICS_MIN_DURATION_SECONDS", "45"))
550
- max_dur = float(os.getenv("ACESTEP_LYRICS_MAX_DURATION_SECONDS", "180"))
551
- return float(min(max(est, min_dur), max_dur))
552
-
553
- def _normalize_metas(meta: Dict[str, Any]) -> Dict[str, Any]:
554
- """Ensure a stable `metas` dict (keys always present)."""
555
- meta = meta or {}
556
- out: Dict[str, Any] = dict(meta)
557
-
558
- # Normalize key aliases
559
- if "keyscale" not in out and "key_scale" in out:
560
- out["keyscale"] = out.get("key_scale")
561
- if "timesignature" not in out and "time_signature" in out:
562
- out["timesignature"] = out.get("time_signature")
563
-
564
- # Ensure required keys exist
565
- for k in ["bpm", "duration", "genres", "keyscale", "timesignature"]:
566
- if out.get(k) in (None, ""):
567
- out[k] = "N/A"
568
- return out
569
-
570
  def _ensure_llm_ready() -> None:
 
571
  with app.state._llm_init_lock:
572
  initialized = getattr(app.state, "_llm_initialized", False)
573
  had_error = getattr(app.state, "_llm_init_error", None)
@@ -597,269 +486,207 @@ def create_app() -> FastAPI:
597
  else:
598
  app.state._llm_initialized = True
599
 
600
- # Optional: generate 5Hz LM codes server-side
601
- audio_code_string = req.audio_code_string
602
- bpm_val = req.bpm
603
- key_scale_val = req.key_scale
604
- time_sig_val = req.time_signature
605
- audio_duration_val = req.audio_duration
606
-
607
- thinking = bool(getattr(req, "thinking", False))
608
-
609
- print(
610
- "[api_server] parsed req: "
611
- f"thinking={thinking}, caption_len={len((req.caption or '').strip())}, lyrics_len={len((req.lyrics or '').strip())}, "
612
- f"bpm={req.bpm}, audio_duration={req.audio_duration}, key_scale={req.key_scale!r}, time_signature={req.time_signature!r}"
613
- )
614
 
615
- # If LM-generated code hints are used, a too-strong cover strength can suppress lyric/vocal conditioning.
616
- # We keep backward compatibility: only auto-adjust when user didn't override (still at default 1.0).
617
- audio_cover_strength_val = float(req.audio_cover_strength)
618
 
619
- lm_meta: Optional[Dict[str, Any]] = None
620
 
621
- sample_mode = bool(getattr(req, "sample_mode", False))
622
- if sample_mode:
623
  _ensure_llm_ready()
624
  if getattr(app.state, "_llm_init_error", None):
625
  raise RuntimeError(f"5Hz LM init failed: {app.state._llm_init_error}")
626
 
627
  sample_metadata, sample_status = llm.understand_audio_from_codes(
628
  audio_codes="NO USER INPUT",
629
- temperature=float(getattr(req, "lm_temperature", _LM_DEFAULT_TEMPERATURE)),
630
- cfg_scale=max(1.0, float(getattr(req, "lm_cfg_scale", _LM_DEFAULT_CFG_SCALE))),
631
- negative_prompt=str(getattr(req, "lm_negative_prompt", "NO USER INPUT") or "NO USER INPUT"),
632
- top_k=_normalize_optional_int(getattr(req, "lm_top_k", None)),
633
- top_p=_normalize_optional_float(getattr(req, "lm_top_p", None)),
634
- repetition_penalty=float(getattr(req, "lm_repetition_penalty", 1.0)),
635
- use_constrained_decoding=bool(getattr(req, "constrained_decoding", True)),
636
- constrained_decoding_debug=bool(getattr(req, "constrained_decoding_debug", False)),
637
  )
638
 
639
  if not sample_metadata or str(sample_status).startswith("❌"):
640
  raise RuntimeError(f"Sample generation failed: {sample_status}")
641
 
642
- req.caption = str(sample_metadata.get("caption", "") or "")
643
- req.lyrics = str(sample_metadata.get("lyrics", "") or "")
644
- req.bpm = _to_int(sample_metadata.get("bpm"), req.bpm)
645
-
646
- sample_keyscale = sample_metadata.get("keyscale", sample_metadata.get("key_scale", ""))
647
- if sample_keyscale:
648
- req.key_scale = str(sample_keyscale)
649
-
650
- sample_timesig = sample_metadata.get("timesignature", sample_metadata.get("time_signature", ""))
651
- if sample_timesig:
652
- req.time_signature = str(sample_timesig)
653
-
654
- sample_duration = _to_float(sample_metadata.get("duration"), None)
655
- if sample_duration is not None and sample_duration > 0:
656
- req.audio_duration = sample_duration
657
-
658
- lm_meta = sample_metadata
659
-
660
- fallback_values: Dict[str, Any] = {}
661
- default_bpm = _to_int(os.getenv("ACESTEP_SAMPLE_DEFAULT_BPM", "120"), 120) or 120
662
- default_duration = _to_float(os.getenv("ACESTEP_SAMPLE_DEFAULT_DURATION_SECONDS", "120"), 120.0) or 120.0
663
- default_key = os.getenv("ACESTEP_SAMPLE_DEFAULT_KEY", "C Major") or "C Major"
664
- default_timesig = os.getenv("ACESTEP_SAMPLE_DEFAULT_TIMESIGNATURE", "4/4") or "4/4"
665
-
666
- if req.bpm is None or req.bpm <= 0:
667
- req.bpm = default_bpm
668
- fallback_values["bpm"] = default_bpm
669
-
670
- if req.audio_duration is None or req.audio_duration <= 0:
671
- req.audio_duration = default_duration
672
- fallback_values["audio_duration"] = default_duration
673
-
674
- if not (req.key_scale or "").strip():
675
- req.key_scale = default_key
676
- fallback_values["key_scale"] = default_key
677
-
678
- if not (req.time_signature or "").strip():
679
- req.time_signature = default_timesig
680
- fallback_values["time_signature"] = default_timesig
681
-
682
- if fallback_values:
683
- print("[api_server] sample mode fallback values:", fallback_values)
684
-
685
- print(
686
- "[api_server] sample mode metadata:",
687
- {
688
- "caption_len": len(req.caption),
689
- "lyrics_len": len(req.lyrics),
690
- "bpm": req.bpm,
691
- "audio_duration": req.audio_duration,
692
- "key_scale": req.key_scale,
693
- "time_signature": req.time_signature,
694
- },
695
- )
696
-
697
- # Determine effective batch size (used for per-sample LM code diversity)
698
- effective_batch_size = req.batch_size
699
- if effective_batch_size is None:
700
- try:
701
- effective_batch_size = int(getattr(h, "batch_size", 1))
702
- except Exception:
703
- effective_batch_size = 1
704
- effective_batch_size = max(1, int(effective_batch_size))
705
-
706
- has_codes = bool(audio_code_string and str(audio_code_string).strip())
707
- need_lm_codes = bool(thinking) and (not has_codes)
708
-
709
- use_constrained_decoding = bool(getattr(req, "constrained_decoding", True))
710
- constrained_decoding_debug = bool(getattr(req, "constrained_decoding_debug", False))
711
- use_cot_caption = bool(getattr(req, "use_cot_caption", True))
712
- use_cot_language = bool(getattr(req, "use_cot_language", True))
713
- is_format_caption = bool(getattr(req, "is_format_caption", False))
714
-
715
- # pass them into constrained decoding so LM injects them directly
716
- # (i.e. does not re-infer / override those fields).
717
- user_metadata: Dict[str, Optional[str]] = {}
718
-
719
- def _set_user_meta(field: str, value: Optional[Any]) -> None:
720
- if value is None:
721
- return
722
- s = str(value).strip()
723
- if not s or s.upper() == "N/A":
724
- return
725
- user_metadata[field] = s
726
-
727
- _set_user_meta("bpm", int(bpm_val) if bpm_val is not None else None)
728
- _set_user_meta("duration", float(audio_duration_val) if audio_duration_val is not None else None)
729
- _set_user_meta("keyscale", key_scale_val if (key_scale_val or "").strip() else None)
730
- _set_user_meta("timesignature", time_sig_val if (time_sig_val or "").strip() else None)
731
-
732
- def _has_meta(field: str) -> bool:
733
- v = user_metadata.get(field)
734
- return bool((v or "").strip())
735
-
736
- need_lm_metas = not (
737
- _has_meta("bpm")
738
- and _has_meta("duration")
739
- and _has_meta("keyscale")
740
- and _has_meta("timesignature")
741
  )
742
 
743
- lm_target_duration: Optional[float] = None
744
- if need_lm_codes:
745
- # If user specified a duration, constrain codes generation length accordingly.
746
- if audio_duration_val is not None and float(audio_duration_val) > 0:
747
- lm_target_duration = float(audio_duration_val)
748
-
749
- print(
750
- "[api_server] LM调用参数: "
751
- f"user_metadata_keys={sorted(user_metadata.keys())}, target_duration={lm_target_duration}, "
752
- f"need_lm_codes={need_lm_codes}, need_lm_metas={need_lm_metas}, "
753
- f"use_constrained_decoding={use_constrained_decoding}, use_cot_caption={use_cot_caption}, "
754
- f"use_cot_language={use_cot_language}, is_format_caption={is_format_caption}"
755
  )
756
 
757
- if need_lm_metas or need_lm_codes:
758
- _ensure_llm_ready()
759
-
760
- if getattr(app.state, "_llm_init_error", None):
761
- # If codes generation is required, fail hard.
762
- if need_lm_codes:
763
- raise RuntimeError(f"5Hz LM init failed: {app.state._llm_init_error}")
764
- # Otherwise, skip LM best-effort (fallback to default/meta-less behavior)
765
- else:
766
- lm_infer = "llm_dit" if need_lm_codes else "dit"
767
-
768
- def _lm_call() -> tuple[Dict[str, Any], str, str]:
769
- return llm.generate_with_stop_condition(
770
- caption=req.caption,
771
- lyrics=req.lyrics,
772
- infer_type=lm_infer,
773
- temperature=float(req.lm_temperature),
774
- cfg_scale=max(1.0, float(req.lm_cfg_scale)),
775
- negative_prompt=str(req.lm_negative_prompt or "NO USER INPUT"),
776
- top_k=_normalize_optional_int(req.lm_top_k),
777
- top_p=_normalize_optional_float(req.lm_top_p),
778
- repetition_penalty=float(req.lm_repetition_penalty),
779
- target_duration=lm_target_duration,
780
- user_metadata=(user_metadata or None),
781
- use_constrained_decoding=use_constrained_decoding,
782
- constrained_decoding_debug=constrained_decoding_debug,
783
- use_cot_caption=use_cot_caption,
784
- use_cot_language=use_cot_language,
785
- is_format_caption=is_format_caption,
786
- )
787
-
788
- meta, codes, status = _lm_call()
789
- lm_meta = meta
790
-
791
- if need_lm_codes:
792
- if not codes:
793
- raise RuntimeError(f"5Hz LM generation failed: {status}")
794
-
795
- # LM once per job; rely on DiT seeds for batch diversity.
796
- # For convenience, replicate the same codes across the batch.
797
- if effective_batch_size > 1:
798
- audio_code_string = [codes] * effective_batch_size
799
- else:
800
- audio_code_string = codes
801
-
802
- # Fill only missing fields (user-provided values win)
803
- bpm_val, key_scale_val, time_sig_val, audio_duration_val = _maybe_fill_from_metadata(req, meta)
804
-
805
- # If user provided lyrics but LM didn't provide a usable duration, estimate a longer duration.
806
- if audio_duration_val is None and (req.audio_duration is None):
807
- est = _estimate_duration_from_lyrics(req.lyrics)
808
- if est is not None:
809
- audio_duration_val = est
810
-
811
- # Optional: auto-tune LM cover strength (opt-in) to avoid suppressing lyric/vocal conditioning.
812
- if thinking and audio_cover_strength_val >= 0.999 and (req.lyrics or "").strip():
813
- tuned = os.getenv("ACESTEP_LM_COVER_STRENGTH")
814
- if tuned is not None and tuned.strip() != "":
815
- audio_cover_strength_val = float(tuned)
816
-
817
- # Align behavior:
818
- # - thinking=False: metas only (ignore audio codes), keep text2music.
819
- # - thinking=True: metas + audio codes, run in cover mode with LM instruction.
820
- instruction_val = req.instruction
821
- task_type_val = (req.task_type or "").strip() or "text2music"
822
-
823
- if not thinking:
824
- audio_code_string = ""
825
- if task_type_val == "cover":
826
- task_type_val = "text2music"
827
- if (instruction_val or "").strip() in {"", _DEFAULT_LM_INSTRUCTION}:
828
- instruction_val = _DEFAULT_DIT_INSTRUCTION
829
-
830
- if thinking:
831
- task_type_val = "cover"
832
- if (instruction_val or "").strip() in {"", _DEFAULT_DIT_INSTRUCTION}:
833
- instruction_val = _DEFAULT_LM_INSTRUCTION
834
-
835
- if not (audio_code_string and str(audio_code_string).strip()):
836
- # thinking=True requires codes generation.
837
- raise RuntimeError("thinking=true requires non-empty audio codes (LM generation failed).")
838
-
839
- # Response metas MUST reflect the actual values used by DiT.
840
- metas_out = _normalize_metas(lm_meta or {})
841
- if bpm_val is not None and int(bpm_val) > 0:
842
- metas_out["bpm"] = int(bpm_val)
843
- if audio_duration_val is not None and float(audio_duration_val) > 0:
844
- metas_out["duration"] = float(audio_duration_val)
845
- if (key_scale_val or "").strip():
846
- metas_out["keyscale"] = str(key_scale_val)
847
- if (time_sig_val or "").strip():
848
- metas_out["timesignature"] = str(time_sig_val)
849
-
850
- def _ensure_text_meta(field: str, fallback: Optional[str]) -> None:
851
- existing = metas_out.get(field)
852
- if isinstance(existing, str):
853
- stripped = existing.strip()
854
- if stripped and stripped.upper() != "N/A":
855
- return
856
- if fallback is None:
857
- return
858
- if fallback.strip():
859
- metas_out[field] = fallback
860
-
861
- _ensure_text_meta("caption", req.caption)
862
- _ensure_text_meta("lyrics", req.lyrics)
863
 
864
  def _none_if_na_str(v: Any) -> Optional[str]:
865
  if v is None:
@@ -868,44 +695,17 @@ def create_app() -> FastAPI:
868
  if s in {"", "N/A"}:
869
  return None
870
  return s
871
- first, second, paths, gen_info, status_msg, seed_value, *_ = h.generate_music(
872
- captions=req.caption,
873
- lyrics=req.lyrics,
874
- bpm=bpm_val,
875
- key_scale=key_scale_val,
876
- time_signature=time_sig_val,
877
- vocal_language=req.vocal_language,
878
- inference_steps=req.inference_steps,
879
- guidance_scale=req.guidance_scale,
880
- use_random_seed=req.use_random_seed,
881
- seed=("-1" if (req.use_random_seed and int(req.seed) < 0) else str(req.seed)),
882
- reference_audio=req.reference_audio_path,
883
- audio_duration=audio_duration_val,
884
- batch_size=req.batch_size,
885
- src_audio=req.src_audio_path,
886
- audio_code_string=audio_code_string,
887
- repainting_start=req.repainting_start,
888
- repainting_end=req.repainting_end,
889
- instruction=instruction_val,
890
- audio_cover_strength=audio_cover_strength_val,
891
- task_type=task_type_val,
892
- use_adg=req.use_adg,
893
- cfg_interval_start=req.cfg_interval_start,
894
- cfg_interval_end=req.cfg_interval_end,
895
- audio_format=req.audio_format,
896
- use_tiled_decode=req.use_tiled_decode,
897
- progress=None,
898
- )
899
  return {
900
- "first_audio_path": _path_to_audio_url(first) if first else None,
901
- "second_audio_path": _path_to_audio_url(second) if second else None,
902
- "audio_paths": [_path_to_audio_url(p) for p in (paths or [])],
903
- "generation_info": gen_info,
904
- "status_message": status_msg,
905
  "seed_value": seed_value,
906
  "metas": metas_out,
907
- "bpm": int(bpm_val) if bpm_val is not None else None,
908
- "duration": float(audio_duration_val) if audio_duration_val is not None else None,
909
  "genres": _none_if_na_str(metas_out.get("genres")),
910
  "keyscale": _none_if_na_str(metas_out.get("keyscale")),
911
  "timesignature": _none_if_na_str(metas_out.get("timesignature")),
@@ -1010,53 +810,6 @@ def create_app() -> FastAPI:
1010
 
1011
  return default
1012
 
1013
- # Debug: print what keys we actually received (helps explain empty parsed values)
1014
- try:
1015
- top_keys = list(getattr(mapping, "keys", lambda: [])())
1016
- except Exception:
1017
- top_keys = []
1018
- try:
1019
- nested_probe = (
1020
- get("metas", None)
1021
- or get("meta", None)
1022
- or get("metadata", None)
1023
- or get("user_metadata", None)
1024
- or get("userMetadata", None)
1025
- )
1026
- if isinstance(nested_probe, str):
1027
- sp = nested_probe.strip()
1028
- if sp.startswith("{") and sp.endswith("}"):
1029
- try:
1030
- nested_probe = json.loads(sp)
1031
- except Exception:
1032
- nested_probe = None
1033
- nested_keys = list(nested_probe.keys()) if isinstance(nested_probe, dict) else []
1034
- except Exception:
1035
- nested_keys = []
1036
- print(f"[api_server] request keys: top={sorted(top_keys)}, nested={sorted(nested_keys)}")
1037
-
1038
- # Debug: print raw values/types for common meta fields (top-level + common aliases)
1039
- try:
1040
- probe_keys = [
1041
- "thinking",
1042
- "bpm",
1043
- "audio_duration",
1044
- "duration",
1045
- "audioDuration",
1046
- "key_scale",
1047
- "keyscale",
1048
- "keyScale",
1049
- "time_signature",
1050
- "timesignature",
1051
- "timeSignature",
1052
- ]
1053
- raw = {k: get(k, None) for k in probe_keys}
1054
- raw_types = {k: (type(v).__name__ if v is not None else None) for k, v in raw.items()}
1055
- print(f"[api_server] request raw: {raw}")
1056
- print(f"[api_server] request raw types: {raw_types}")
1057
- except Exception:
1058
- pass
1059
-
1060
  normalized_audio_duration = _to_float(_get_any("audio_duration", "duration", "audioDuration"), None)
1061
  normalized_bpm = _to_int(_get_any("bpm"), None)
1062
  normalized_keyscale = str(_get_any("key_scale", "keyscale", "keyScale", default="") or "")
@@ -1066,12 +819,6 @@ def create_app() -> FastAPI:
1066
  if normalized_audio_duration is None:
1067
  normalized_audio_duration = _to_float(_get_any("target_duration", "targetDuration"), None)
1068
 
1069
- print(
1070
- "[api_server] normalized: "
1071
- f"thinking={_to_bool(get('thinking'), False)}, bpm={normalized_bpm}, "
1072
- f"audio_duration={normalized_audio_duration}, key_scale={normalized_keyscale!r}, time_signature={normalized_timesig!r}"
1073
- )
1074
-
1075
  return GenerateMusicRequest(
1076
  caption=str(get("caption", "") or ""),
1077
  lyrics=str(get("lyrics", "") or ""),
@@ -1110,7 +857,6 @@ def create_app() -> FastAPI:
1110
  lm_negative_prompt=str(get("lm_negative_prompt", "NO USER INPUT") or "NO USER INPUT"),
1111
  constrained_decoding=_to_bool(_get_any("constrained_decoding", "constrainedDecoding", "constrained"), True),
1112
  constrained_decoding_debug=_to_bool(_get_any("constrained_decoding_debug", "constrainedDecodingDebug"), False),
1113
- # Accept common aliases, including hyphenated keys from some clients.
1114
  use_cot_caption=_to_bool(_get_any("use_cot_caption", "cot_caption", "cot-caption"), True),
1115
  use_cot_language=_to_bool(_get_any("use_cot_language", "cot_language", "cot-language"), True),
1116
  is_format_caption=_to_bool(_get_any("is_format_caption", "isFormatCaption"), False),
 
44
  DEFAULT_DIT_INSTRUCTION,
45
  DEFAULT_LM_INSTRUCTION,
46
  )
47
+ from acestep.inference import (
48
+ GenerationParams,
49
+ GenerationConfig,
50
+ generate_music,
51
+ )
52
+ from acestep.gradio_ui.events.results_handlers import _build_generation_info
53
 
54
 
55
  JobStatus = Literal["queued", "running", "succeeded", "failed"]
 
393
  app.state.executor = executor
394
  app.state.job_store = store
395
  app.state._python_executable = sys.executable
396
+
397
+ # Temporary directory for saving generated audio files
398
+ app.state.temp_audio_dir = os.path.join(tmp_root, "api_audio")
399
+ os.makedirs(app.state.temp_audio_dir, exist_ok=True)
400
 
401
  async def _ensure_initialized() -> None:
402
  h: AceStepHandler = app.state.handler
 
453
  job_store.mark_running(job_id)
454
 
455
  def _blocking_generate() -> Dict[str, Any]:
456
+ """Generate music using unified inference logic from acestep.inference"""
457
+
458
  def _ensure_llm_ready() -> None:
459
+ """Ensure LLM handler is initialized when needed"""
460
  with app.state._llm_init_lock:
461
  initialized = getattr(app.state, "_llm_initialized", False)
462
  had_error = getattr(app.state, "_llm_init_error", None)
 
486
  else:
487
  app.state._llm_initialized = True
488
 
489
+ def _normalize_metas(meta: Dict[str, Any]) -> Dict[str, Any]:
490
+ """Ensure a stable `metas` dict (keys always present)."""
491
+ meta = meta or {}
492
+ out: Dict[str, Any] = dict(meta)
493
 
494
+ # Normalize key aliases
495
+ if "keyscale" not in out and "key_scale" in out:
496
+ out["keyscale"] = out.get("key_scale")
497
+ if "timesignature" not in out and "time_signature" in out:
498
+ out["timesignature"] = out.get("time_signature")
499
 
500
+ # Ensure required keys exist
501
+ for k in ["bpm", "duration", "genres", "keyscale", "timesignature"]:
502
+ if out.get(k) in (None, ""):
503
+ out[k] = "N/A"
504
+ return out
505
 
506
+ # Normalize LM sampling parameters
507
+ lm_top_k = req.lm_top_k if req.lm_top_k and req.lm_top_k > 0 else 0
508
+ lm_top_p = req.lm_top_p if req.lm_top_p and req.lm_top_p < 1.0 else 0.9
509
+
510
+ # Determine if LLM is needed
511
+ thinking = bool(req.thinking)
512
+ sample_mode = bool(req.sample_mode)
513
+ need_llm = thinking or sample_mode
514
+
515
+ print(f"[api_server] Request params: req.thinking={req.thinking}, req.sample_mode={req.sample_mode}")
516
+ print(f"[api_server] Determined: thinking={thinking}, sample_mode={sample_mode}, need_llm={need_llm}")
517
+
518
+ # Ensure LLM is ready if needed
519
+ if need_llm:
520
  _ensure_llm_ready()
521
  if getattr(app.state, "_llm_init_error", None):
522
  raise RuntimeError(f"5Hz LM init failed: {app.state._llm_init_error}")
523
 
524
+ # Handle sample mode: generate random caption/lyrics first
525
+ caption = req.caption
526
+ lyrics = req.lyrics
527
+ bpm = req.bpm
528
+ key_scale = req.key_scale
529
+ time_signature = req.time_signature
530
+ audio_duration = req.audio_duration
531
+
532
+ if sample_mode:
533
+ print("[api_server] Sample mode: generating random caption/lyrics via LM")
534
  sample_metadata, sample_status = llm.understand_audio_from_codes(
535
  audio_codes="NO USER INPUT",
536
+ temperature=req.lm_temperature,
537
+ cfg_scale=max(1.0, req.lm_cfg_scale),
538
+ negative_prompt=req.lm_negative_prompt,
539
+ top_k=lm_top_k if lm_top_k > 0 else None,
540
+ top_p=lm_top_p if lm_top_p < 1.0 else None,
541
+ repetition_penalty=req.lm_repetition_penalty,
542
+ use_constrained_decoding=req.constrained_decoding,
543
+ constrained_decoding_debug=req.constrained_decoding_debug,
544
  )
545
 
546
  if not sample_metadata or str(sample_status).startswith("❌"):
547
  raise RuntimeError(f"Sample generation failed: {sample_status}")
548
 
549
+ # Use generated values with fallback defaults
550
+ caption = sample_metadata.get("caption", "")
551
+ lyrics = sample_metadata.get("lyrics", "")
552
+ bpm = _to_int(sample_metadata.get("bpm"), None) or _to_int(os.getenv("ACESTEP_SAMPLE_DEFAULT_BPM", "120"), 120)
553
+ key_scale = sample_metadata.get("keyscale", "") or os.getenv("ACESTEP_SAMPLE_DEFAULT_KEY", "C Major")
554
+ time_signature = sample_metadata.get("timesignature", "") or os.getenv("ACESTEP_SAMPLE_DEFAULT_TIMESIGNATURE", "4/4")
555
+ audio_duration = _to_float(sample_metadata.get("duration"), None) or _to_float(os.getenv("ACESTEP_SAMPLE_DEFAULT_DURATION_SECONDS", "120"), 120.0)
556
+
557
+ print(f"[api_server] Sample generated: caption_len={len(caption)}, lyrics_len={len(lyrics)}, bpm={bpm}, duration={audio_duration}")
558
+
559
+ print(f"[api_server] Before GenerationParams: thinking={thinking}, sample_mode={sample_mode}")
560
+ print(f"[api_server] Caption/Lyrics to use: caption_len={len(caption)}, lyrics_len={len(lyrics)}")
561
+
562
+ # Build GenerationParams using unified interface
563
+ # Note: thinking controls LM code generation, sample_mode only affects CoT metas
564
+ params = GenerationParams(
565
+ task_type=req.task_type,
566
+ instruction=req.instruction,
567
+ reference_audio=req.reference_audio_path,
568
+ src_audio=req.src_audio_path,
569
+ audio_codes=req.audio_code_string,
570
+ caption=caption,
571
+ lyrics=lyrics,
572
+ instrumental=False,
573
+ vocal_language=req.vocal_language,
574
+ bpm=bpm,
575
+ keyscale=key_scale,
576
+ timesignature=time_signature,
577
+ duration=audio_duration if audio_duration else -1.0,
578
+ inference_steps=req.inference_steps,
579
+ seed=req.seed,
580
+ guidance_scale=req.guidance_scale,
581
+ use_adg=req.use_adg,
582
+ cfg_interval_start=req.cfg_interval_start,
583
+ cfg_interval_end=req.cfg_interval_end,
584
+ repainting_start=req.repainting_start,
585
+ repainting_end=req.repainting_end if req.repainting_end else -1,
586
+ audio_cover_strength=req.audio_cover_strength,
587
+ # LM parameters
588
+ thinking=thinking, # Use LM for code generation when thinking=True
589
+ lm_temperature=req.lm_temperature,
590
+ lm_cfg_scale=req.lm_cfg_scale,
591
+ lm_top_k=lm_top_k,
592
+ lm_top_p=lm_top_p,
593
+ lm_negative_prompt=req.lm_negative_prompt,
594
+ use_cot_metas=not sample_mode, # Sample mode already generated metas, don't regenerate
595
+ use_cot_caption=req.use_cot_caption,
596
+ use_cot_language=req.use_cot_language,
597
+ use_constrained_decoding=req.constrained_decoding,
598
  )
599
 
600
+ # Build GenerationConfig - default to 2 audios like gradio_ui
601
+ batch_size = req.batch_size if req.batch_size is not None else 2
602
+ config = GenerationConfig(
603
+ batch_size=batch_size,
604
+ use_random_seed=req.use_random_seed,
605
+ seeds=None, # Let unified logic handle seed generation
606
+ audio_format=req.audio_format,
607
+ constrained_decoding_debug=req.constrained_decoding_debug,
608
  )
609
 
610
+ # Check LLM initialization status
611
+ llm_is_initialized = getattr(app.state, "_llm_initialized", False)
612
+ llm_to_pass = llm if llm_is_initialized else None
613
+
614
+ print(f"[api_server] Generating music with unified interface:")
615
+ print(f" - thinking={params.thinking}")
616
+ print(f" - batch_size={batch_size}")
617
+ print(f" - llm_initialized={llm_is_initialized}")
618
+ print(f" - llm_handler={'Available' if llm_to_pass else 'None'}")
619
+
620
+ # Generate music using unified interface
621
+ result = generate_music(
622
+ dit_handler=h,
623
+ llm_handler=llm_to_pass,
624
+ params=params,
625
+ config=config,
626
+ save_dir=app.state.temp_audio_dir,
627
+ progress=None,
628
+ )
629
+
630
+ print(f"[api_server] Generation completed. Success={result.success}, Audios={len(result.audios)}")
631
+ print(f"[api_server] Time costs keys: {list(result.extra_outputs.get('time_costs', {}).keys())}")
632
+
633
+ if not result.success:
634
+ raise RuntimeError(f"Music generation failed: {result.error or result.status_message}")
635
+
636
+ # Extract results
637
+ audio_paths = [audio["path"] for audio in result.audios if audio.get("path")]
638
+ first_audio = audio_paths[0] if len(audio_paths) > 0 else None
639
+ second_audio = audio_paths[1] if len(audio_paths) > 1 else None
640
+
641
+ # Get metadata from LM or CoT results
642
+ lm_metadata = result.extra_outputs.get("lm_metadata", {})
643
+ metas_out = _normalize_metas(lm_metadata)
644
+
645
+ # Update metas with actual values used
646
+ if params.cot_bpm:
647
+ metas_out["bpm"] = params.cot_bpm
648
+ elif bpm:
649
+ metas_out["bpm"] = bpm
650
+
651
+ if params.cot_duration:
652
+ metas_out["duration"] = params.cot_duration
653
+ elif audio_duration:
654
+ metas_out["duration"] = audio_duration
655
+
656
+ if params.cot_keyscale:
657
+ metas_out["keyscale"] = params.cot_keyscale
658
+ elif key_scale:
659
+ metas_out["keyscale"] = key_scale
660
+
661
+ if params.cot_timesignature:
662
+ metas_out["timesignature"] = params.cot_timesignature
663
+ elif time_signature:
664
+ metas_out["timesignature"] = time_signature
665
+
666
+ # Ensure caption and lyrics are in metas
667
+ if caption:
668
+ metas_out["caption"] = caption
669
+ if lyrics:
670
+ metas_out["lyrics"] = lyrics
671
+
672
+ # Extract seed values for response (comma-separated for multiple audios)
673
+ seed_values = []
674
+ for audio in result.audios:
675
+ audio_params = audio.get("params", {})
676
+ seed = audio_params.get("seed")
677
+ if seed is not None:
678
+ seed_values.append(str(seed))
679
+ seed_value = ",".join(seed_values) if seed_values else ""
680
+
681
+ # Build generation_info using the helper function (like gradio_ui)
682
+ time_costs = result.extra_outputs.get("time_costs", {})
683
+ generation_info = _build_generation_info(
684
+ lm_metadata=lm_metadata,
685
+ time_costs=time_costs,
686
+ seed_value=seed_value,
687
+ inference_steps=req.inference_steps,
688
+ num_audios=len(result.audios),
689
+ )
690
 
691
  def _none_if_na_str(v: Any) -> Optional[str]:
692
  if v is None:
 
695
  if s in {"", "N/A"}:
696
  return None
697
  return s
698
+
699
  return {
700
+ "first_audio_path": _path_to_audio_url(first_audio) if first_audio else None,
701
+ "second_audio_path": _path_to_audio_url(second_audio) if second_audio else None,
702
+ "audio_paths": [_path_to_audio_url(p) for p in audio_paths],
703
+ "generation_info": generation_info,
704
+ "status_message": result.status_message,
705
  "seed_value": seed_value,
706
  "metas": metas_out,
707
+ "bpm": metas_out.get("bpm") if isinstance(metas_out.get("bpm"), int) else None,
708
+ "duration": metas_out.get("duration") if isinstance(metas_out.get("duration"), (int, float)) else None,
709
  "genres": _none_if_na_str(metas_out.get("genres")),
710
  "keyscale": _none_if_na_str(metas_out.get("keyscale")),
711
  "timesignature": _none_if_na_str(metas_out.get("timesignature")),
 
810
 
811
  return default
812
 
813
  normalized_audio_duration = _to_float(_get_any("audio_duration", "duration", "audioDuration"), None)
814
  normalized_bpm = _to_int(_get_any("bpm"), None)
815
  normalized_keyscale = str(_get_any("key_scale", "keyscale", "keyScale", default="") or "")
 
819
  if normalized_audio_duration is None:
820
  normalized_audio_duration = _to_float(_get_any("target_duration", "targetDuration"), None)
821
 
822
  return GenerateMusicRequest(
823
  caption=str(get("caption", "") or ""),
824
  lyrics=str(get("lyrics", "") or ""),
 
857
  lm_negative_prompt=str(get("lm_negative_prompt", "NO USER INPUT") or "NO USER INPUT"),
858
  constrained_decoding=_to_bool(_get_any("constrained_decoding", "constrainedDecoding", "constrained"), True),
859
  constrained_decoding_debug=_to_bool(_get_any("constrained_decoding_debug", "constrainedDecodingDebug"), False),
 
860
  use_cot_caption=_to_bool(_get_any("use_cot_caption", "cot_caption", "cot-caption"), True),
861
  use_cot_language=_to_bool(_get_any("use_cot_language", "cot_language", "cot-language"), True),
862
  is_format_caption=_to_bool(_get_any("is_format_caption", "isFormatCaption"), False),
acestep/audio_utils.py ADDED
@@ -0,0 +1,320 @@
1
+ """
2
+ Audio saving and transcoding utility module
3
+
4
+ Independent audio file operations outside of handler, supporting:
5
+ - Save audio tensor/numpy to files (default FLAC format, fast)
6
+ - Format conversion (FLAC/WAV/MP3)
7
+ - Batch processing
8
+ """
9
+
10
+ import os
11
+ import hashlib
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Union, Optional, List, Tuple
15
+ import torch
16
+ import numpy as np
17
+ import torchaudio
18
+ from loguru import logger
19
+
20
+
21
+ class AudioSaver:
22
+ """Audio saving and transcoding utility class"""
23
+
24
+ def __init__(self, default_format: str = "flac"):
25
+ """
26
+ Initialize audio saver
27
+
28
+ Args:
29
+ default_format: Default save format ('flac', 'wav', 'mp3')
30
+ """
31
+ self.default_format = default_format.lower()
32
+ if self.default_format not in ["flac", "wav", "mp3"]:
33
+ logger.warning(f"Unsupported format {default_format}, using 'flac'")
34
+ self.default_format = "flac"
35
+
36
+ def save_audio(
37
+ self,
38
+ audio_data: Union[torch.Tensor, np.ndarray],
39
+ output_path: Union[str, Path],
40
+ sample_rate: int = 48000,
41
+ format: Optional[str] = None,
42
+ channels_first: bool = True,
43
+ ) -> str:
44
+ """
45
+ Save audio data to file
46
+
47
+ Args:
48
+ audio_data: Audio data, torch.Tensor [channels, samples] or numpy.ndarray
49
+ output_path: Output file path (extension can be omitted)
50
+ sample_rate: Sample rate
51
+ format: Audio format ('flac', 'wav', 'mp3'), defaults to default_format
52
+ channels_first: If True, tensor format is [channels, samples], else [samples, channels]
53
+
54
+ Returns:
55
+ Actual saved file path
56
+ """
57
+ format = (format or self.default_format).lower()
58
+ if format not in ["flac", "wav", "mp3"]:
59
+ logger.warning(f"Unsupported format {format}, using {self.default_format}")
60
+ format = self.default_format
61
+
62
+ # Ensure output path has correct extension
63
+ output_path = Path(output_path)
64
+ if output_path.suffix.lower() not in ['.flac', '.wav', '.mp3']:
65
+ output_path = output_path.with_suffix(f'.{format}')
66
+
67
+ # Convert to torch tensor
68
+ if isinstance(audio_data, np.ndarray):
69
+ if channels_first:
70
+ # numpy [samples, channels] -> tensor [channels, samples]
71
+ audio_tensor = torch.from_numpy(audio_data.T).float()
72
+ else:
73
+ # numpy [samples, channels] -> tensor [samples, channels] -> [channels, samples]
74
+ audio_tensor = torch.from_numpy(audio_data).float()
75
+ if audio_tensor.dim() == 2 and audio_tensor.shape[0] < audio_tensor.shape[1]:
76
+ audio_tensor = audio_tensor.T
77
+ else:
78
+ # torch tensor
79
+ audio_tensor = audio_data.cpu().float()
80
+ if not channels_first and audio_tensor.dim() == 2:
81
+ # [samples, channels] -> [channels, samples]
82
+ if audio_tensor.shape[0] > audio_tensor.shape[1]:
83
+ audio_tensor = audio_tensor.T
84
+
85
+ # Ensure memory is contiguous
86
+ audio_tensor = audio_tensor.contiguous()
87
+
88
+ # Select backend and save
89
+ try:
90
+ if format == "mp3":
91
+ # MP3 uses ffmpeg backend
92
+ torchaudio.save(
93
+ str(output_path),
94
+ audio_tensor,
95
+ sample_rate,
96
+ channels_first=True,
97
+ backend='ffmpeg',
98
+ )
99
+ elif format in ["flac", "wav"]:
100
+ # FLAC and WAV use soundfile backend (fastest)
101
+ torchaudio.save(
102
+ str(output_path),
103
+ audio_tensor,
104
+ sample_rate,
105
+ channels_first=True,
106
+ backend='soundfile',
107
+ )
108
+ else:
109
+ # Other formats use default backend
110
+ torchaudio.save(
111
+ str(output_path),
112
+ audio_tensor,
113
+ sample_rate,
114
+ channels_first=True,
115
+ )
116
+
117
+ logger.debug(f"[AudioSaver] Saved audio to {output_path} ({format}, {sample_rate}Hz)")
118
+ return str(output_path)
119
+
120
+ except Exception as e:
121
+ logger.error(f"[AudioSaver] Failed to save audio: {e}")
122
+ raise
123
+
124
+ def convert_audio(
125
+ self,
126
+ input_path: Union[str, Path],
127
+ output_path: Union[str, Path],
128
+ output_format: str,
129
+ remove_input: bool = False,
130
+ ) -> str:
131
+ """
132
+ Convert audio format
133
+
134
+ Args:
135
+ input_path: Input audio file path
136
+ output_path: Output audio file path
137
+ output_format: Target format ('flac', 'wav', 'mp3')
138
+ remove_input: Whether to delete input file
139
+
140
+ Returns:
141
+ Output file path
142
+ """
143
+ input_path = Path(input_path)
144
+ output_path = Path(output_path)
145
+
146
+ if not input_path.exists():
147
+ raise FileNotFoundError(f"Input file not found: {input_path}")
148
+
149
+ # Load audio
150
+ audio_tensor, sample_rate = torchaudio.load(str(input_path))
151
+
152
+ # Save as new format
153
+ output_path = self.save_audio(
154
+ audio_tensor,
155
+ output_path,
156
+ sample_rate=sample_rate,
157
+ format=output_format,
158
+ channels_first=True
159
+ )
160
+
161
+ # Delete input file if needed
162
+ if remove_input:
163
+ input_path.unlink()
164
+ logger.debug(f"[AudioSaver] Removed input file: {input_path}")
165
+
166
+ return output_path
167
+
168
+ def save_batch(
169
+ self,
170
+ audio_batch: Union[List[torch.Tensor], torch.Tensor],
171
+ output_dir: Union[str, Path],
172
+ file_prefix: str = "audio",
173
+ sample_rate: int = 48000,
174
+ format: Optional[str] = None,
175
+ channels_first: bool = True,
176
+ ) -> List[str]:
177
+ """
178
+ Save audio batch
179
+
180
+ Args:
181
+ audio_batch: Audio batch, List[tensor] or tensor [batch, channels, samples]
182
+ output_dir: Output directory
183
+ file_prefix: File prefix
184
+ sample_rate: Sample rate
185
+ format: Audio format
186
+ channels_first: Tensor format flag
187
+
188
+ Returns:
189
+ List of saved file paths
190
+ """
191
+ output_dir = Path(output_dir)
192
+ output_dir.mkdir(parents=True, exist_ok=True)
193
+
194
+ # Process batch
195
+ if isinstance(audio_batch, torch.Tensor) and audio_batch.dim() == 3:
196
+ # [batch, channels, samples]
197
+ audio_list = [audio_batch[i] for i in range(audio_batch.shape[0])]
198
+ elif isinstance(audio_batch, list):
199
+ audio_list = audio_batch
200
+ else:
201
+ audio_list = [audio_batch]
202
+
203
+ saved_paths = []
204
+ for i, audio in enumerate(audio_list):
205
+ output_path = output_dir / f"{file_prefix}_{i:04d}"
206
+ saved_path = self.save_audio(
207
+ audio,
208
+ output_path,
209
+ sample_rate=sample_rate,
210
+ format=format,
211
+ channels_first=channels_first
212
+ )
213
+ saved_paths.append(saved_path)
214
+
215
+ return saved_paths
216
+
217
+
218
+ def get_audio_file_hash(audio_file) -> str:
219
+ """
220
+ Get hash identifier for an audio file.
221
+
222
+ Args:
223
+ audio_file: Path to audio file (str) or file-like object
224
+
225
+ Returns:
226
+ Hash string or empty string
227
+ """
228
+ if audio_file is None:
229
+ return ""
230
+
231
+ try:
232
+ if isinstance(audio_file, str):
233
+ if os.path.exists(audio_file):
234
+ with open(audio_file, 'rb') as f:
235
+ return hashlib.md5(f.read()).hexdigest()
236
+ return hashlib.md5(audio_file.encode('utf-8')).hexdigest()
237
+ elif hasattr(audio_file, 'name'):
238
+ return hashlib.md5(str(audio_file.name).encode('utf-8')).hexdigest()
239
+ return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
240
+ except Exception:
241
+ return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
242
+
243
+
244
+ def generate_uuid_from_params(params_dict) -> str:
245
+ """
246
+ Generate deterministic UUID from generation parameters.
247
+ Same parameters will always generate the same UUID.
248
+
249
+ Args:
250
+ params_dict: Dictionary of parameters
251
+
252
+ Returns:
253
+ UUID string
254
+ """
255
+
256
+ params_json = json.dumps(params_dict, sort_keys=True, ensure_ascii=False)
257
+ hash_obj = hashlib.sha256(params_json.encode('utf-8'))
258
+ hash_hex = hash_obj.hexdigest()
259
+ uuid_str = f"{hash_hex[0:8]}-{hash_hex[8:12]}-{hash_hex[12:16]}-{hash_hex[16:20]}-{hash_hex[20:32]}"
260
+ return uuid_str
261
+
262
+
263
+ def generate_uuid_from_audio_data(
264
+ audio_data: Union[torch.Tensor, np.ndarray],
265
+ seed: Optional[int] = None
266
+ ) -> str:
267
+ """
268
+ Generate UUID from audio data (for caching/deduplication)
269
+
270
+ Args:
271
+ audio_data: Audio data
272
+ seed: Optional seed value
273
+
274
+ Returns:
275
+ UUID string
276
+ """
277
+ if isinstance(audio_data, torch.Tensor):
278
+ # Convert to numpy and calculate hash
279
+ audio_np = audio_data.cpu().numpy()
280
+ else:
281
+ audio_np = audio_data
282
+
283
+ # Calculate data hash
284
+ data_hash = hashlib.md5(audio_np.tobytes()).hexdigest()
285
+
286
+ if seed is not None:
287
+ combined = f"{data_hash}_{seed}"
288
+ return hashlib.md5(combined.encode()).hexdigest()
289
+
290
+ return data_hash
291
+
292
+
293
+ # Global default instance
294
+ _default_saver = AudioSaver(default_format="flac")
295
+
296
+
297
+ def save_audio(
298
+ audio_data: Union[torch.Tensor, np.ndarray],
299
+ output_path: Union[str, Path],
300
+ sample_rate: int = 48000,
301
+ format: Optional[str] = None,
302
+ channels_first: bool = True,
303
+ ) -> str:
304
+ """
305
+ Convenience function: save audio (using default configuration)
306
+
307
+ Args:
308
+ audio_data: Audio data
309
+ output_path: Output path
310
+ sample_rate: Sample rate
311
+ format: Format (default flac)
312
+ channels_first: Tensor format flag
313
+
314
+ Returns:
315
+ Saved file path
316
+ """
317
+ return _default_saver.save_audio(
318
+ audio_data, output_path, sample_rate, format, channels_first
319
+ )
320
+
acestep/constrained_logits_processor.py CHANGED
@@ -571,6 +571,33 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
571
  if self.debug:
572
  logger.debug(f"Built audio code masks for {len(self.audio_code_token_ids)} tokens")
573
 
574
  def _build_keyscale_prefix_tree(self) -> Dict[Tuple[int, ...], Set[int]]:
575
  """
576
  Build keyscale prefix to allowed tokens mapping based on ACTUAL tokenization.
@@ -1484,10 +1511,10 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1484
  if self.debug:
1485
  logger.debug(f"Codes generation: {self.codes_count}/{self.target_codes}, blocking EOS")
1486
  else:
1487
- # Force EOS token when target codes count is reached
1488
- mask = torch.full_like(scores, float('-inf'))
1489
- mask[:, self.eos_token_id] = 0
1490
- scores = scores + mask
1491
  if self.debug:
1492
  logger.debug(f"Codes generation: {self.codes_count}/{self.target_codes}, forcing EOS")
1493
  return self._apply_temperature_scaling(scores)
@@ -1609,20 +1636,15 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1609
  input_ids: torch.LongTensor,
1610
  scores: torch.FloatTensor,
1611
  ) -> torch.FloatTensor:
1612
- """Process a single sequence and return modified scores."""
1613
 
1614
  # Check if we have tokens in queue for user-provided field
1615
  # If so, inject the next token directly
1616
  if self.user_field_token_queue:
1617
- mask = torch.full_like(scores, float('-inf'))
1618
  next_token = self.user_field_token_queue[0]
1619
- mask[0, next_token] = 0
1620
- scores = scores + mask
1621
  return scores
1622
 
1623
- # Create mask (all -inf initially)
1624
- mask = torch.full_like(scores, float('-inf'))
1625
-
1626
  if self.state in self.fixed_strings:
1627
  # Fixed string state: force specific tokens
1628
  fixed_str = self.fixed_strings[self.state]
@@ -1633,28 +1655,18 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1633
  # This happens when we're about to complete the </think> tag
1634
  if self.state == FSMState.THINK_END_TAG and self.stop_at_reasoning:
1635
  # Check if the next token would complete the fixed string
1636
- # We check if position_in_state + length of next token would complete it
1637
- # Since we don't know which token will be selected, we check if we're close to completion
1638
- # Actually, a better approach: check if this is the last character(s) of the fixed string
1639
  remaining_chars = len(fixed_str) - self.position_in_state
1640
  # If remaining is small (<= 10 chars, which is typically 1-2 tokens), force EOS
1641
  if remaining_chars <= 10:
1642
  # Force EOS token to stop generation
1643
  if self.eos_token_id is not None:
1644
- mask[0, self.eos_token_id] = 0
1645
- scores = scores + mask
1646
  if self.debug:
1647
  logger.debug(f"stop_at_reasoning=True: forcing EOS near end of </think> tag (remaining: {remaining_chars} chars)")
1648
  return scores
1649
 
1650
- for t in allowed:
1651
- mask[0, t] = 0
1652
- # Apply mask
1653
- scores = scores + mask
1654
-
1655
- # Update position tracking
1656
- # We need to check if the selected token completes the fixed string
1657
- # This will be done in update_state() after token selection
1658
  else:
1659
  # Position exceeds string, move to next state
1660
  # If stop_at_reasoning is True and we're transitioning from THINK_END_TAG,
@@ -1662,8 +1674,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1662
  if self.state == FSMState.THINK_END_TAG and self.stop_at_reasoning:
1663
  # Force EOS token to stop generation
1664
  if self.eos_token_id is not None:
1665
- mask[0, self.eos_token_id] = 0
1666
- scores = scores + mask
1667
  if self.debug:
1668
  logger.debug(f"stop_at_reasoning=True: forcing EOS after completing </think> tag")
1669
  return scores
@@ -1676,7 +1687,9 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1676
  if self.debug:
1677
  logger.warning(f"State transition from {old_state.name} to {self.state.name} still in fixed_strings, avoiding recursion")
1678
  return scores
1679
- return self._process_single_sequence(input_ids, torch.zeros_like(scores))
1680
 
1681
  elif self.state == FSMState.BPM_VALUE:
1682
  # Check if field is user-provided and we haven't started injecting yet
@@ -1690,22 +1703,18 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1690
  self.user_field_token_queue = value_tokens
1691
  self.current_user_field = "bpm"
1692
  # Inject first token
1693
- mask[0, value_tokens[0]] = 0
1694
- scores = scores + mask
1695
  return scores
1696
 
1697
  # Allow valid numeric tokens using prefix tree (supports multi-digit tokens like "120")
1698
  allowed = self._get_allowed_numeric_tokens(self.bpm_prefix_tree)
1699
- for t in allowed:
1700
- mask[0, t] = 0
1701
 
1702
  # Also allow newline if current token sequence prefix allows it
1703
- # Check if current token sequence is in prefix tree and allows newline
1704
  token_prefix = tuple(self.accumulated_token_ids)
1705
  if token_prefix in self.bpm_prefix_tree and self.newline_token in self.bpm_prefix_tree[token_prefix]:
1706
- mask[0, self.newline_token] = 0
1707
 
1708
- scores = scores + mask
1709
 
1710
  elif self.state == FSMState.CAPTION_VALUE:
1711
  # Caption field generation with YAML format support:
@@ -1724,8 +1733,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1724
  self.user_field_token_queue = value_tokens
1725
  self.current_user_field = "caption"
1726
  # Inject first token
1727
- mask[0, value_tokens[0]] = 0
1728
- scores = scores + mask
1729
  return scores
1730
 
1731
  # Check if we should transition after a newline (non-indented line = new field)
@@ -1757,7 +1765,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1757
  # The field name detection will happen in update_state()
1758
  return scores
1759
 
1760
- # Block backticks (code blocks)
1761
  if self.backtick_token is not None:
1762
  scores[0, self.backtick_token] = float('-inf')
1763
 
@@ -1773,8 +1781,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1773
  if self.caption_token_count >= 512:
1774
  # Force end by only allowing newline
1775
  if self.newline_token is not None:
1776
- mask[0, self.newline_token] = 0
1777
- scores = scores + mask
1778
  return scores
1779
 
1780
  # Allow natural generation (with blocked audio codes and backticks)
@@ -1791,8 +1798,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1791
  self.user_field_token_queue = value_tokens
1792
  self.current_user_field = "duration"
1793
  # Inject first token
1794
- mask[0, value_tokens[0]] = 0
1795
- scores = scores + mask
1796
  return scores
1797
 
1798
  # If target_duration is set, force generate that exact value
@@ -1804,26 +1810,22 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1804
  # Force the next digit
1805
  next_digit = int(target_str[current_pos])
1806
  if next_digit in self.digit_tokens:
1807
- mask[0, self.digit_tokens[next_digit]] = 0
1808
  else:
1809
  # All digits generated, force newline
1810
  if self.newline_token:
1811
- mask[0, self.newline_token] = 0
1812
-
1813
- scores = scores + mask
1814
  else:
1815
  # Normal duration generation with range constraint
1816
  # Allow valid numeric tokens using prefix tree (supports multi-digit tokens like "60", "120")
1817
  allowed = self._get_allowed_numeric_tokens(self.duration_prefix_tree)
1818
- for t in allowed:
1819
- mask[0, t] = 0
1820
 
1821
  # Also allow newline if current token sequence prefix allows it
1822
  token_prefix = tuple(self.accumulated_token_ids)
1823
  if token_prefix in self.duration_prefix_tree and self.newline_token in self.duration_prefix_tree[token_prefix]:
1824
- mask[0, self.newline_token] = 0
1825
 
1826
- scores = scores + mask
1827
 
1828
  elif self.state == FSMState.GENRES_VALUE:
1829
  # Check if field is user-provided and we haven't started injecting yet
@@ -1836,8 +1838,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1836
  self.user_field_token_queue = value_tokens
1837
  self.current_user_field = "genres"
1838
  # Inject first token
1839
- mask[0, value_tokens[0]] = 0
1840
- scores = scores + mask
1841
  return scores
1842
 
1843
  # Try to hot-reload genres vocab if file has changed
@@ -1848,24 +1849,20 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1848
 
1849
  if allowed:
1850
  # Use vocabulary-constrained decoding
1851
- for t in allowed:
1852
- mask[0, t] = 0
1853
- scores = scores + mask
1854
  elif self.genres_vocab:
1855
  # Vocab is loaded but no valid continuation found
1856
  # Force newline to end the field
1857
  if self.newline_token:
1858
- mask[0, self.newline_token] = 0
1859
  if self.debug:
1860
  logger.debug(f"No valid genre continuation for '{self.accumulated_value}', forcing newline")
1861
- scores = scores + mask
1862
  else:
1863
  # Fallback: no vocab loaded, use probability-based ending
1864
  if self._should_end_text_field(scores):
1865
  if self.newline_token:
1866
- mask[0, self.newline_token] = 0
1867
  self._transition_to_next_state()
1868
- scores = scores + mask
1869
  else:
1870
  # Allow any token except newline if we don't have content yet
1871
  if not self.accumulated_value.strip():
@@ -1884,8 +1881,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1884
  self.user_field_token_queue = value_tokens
1885
  self.current_user_field = "keyscale"
1886
  # Inject first token
1887
- mask[0, value_tokens[0]] = 0
1888
- scores = scores + mask
1889
  return scores
1890
 
1891
  # Check if current token sequence is complete (allows newline)
@@ -1893,21 +1889,17 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1893
  if token_prefix in self.keyscale_prefix_tree and self.newline_token in self.keyscale_prefix_tree[token_prefix]:
1894
  # Complete keyscale, allow newline
1895
  if self.newline_token:
1896
- mask[0, self.newline_token] = 0
1897
- scores = scores + mask
1898
  else:
1899
  # Not complete, allow valid continuation tokens
1900
  allowed = self._get_allowed_keyscale_tokens()
1901
  if allowed:
1902
- for t in allowed:
1903
- mask[0, t] = 0
1904
- scores = scores + mask
1905
  else:
1906
  # No valid tokens found - force newline to end field
1907
  # This handles edge cases where keyscale format is unexpected
1908
  if self.newline_token:
1909
- mask[0, self.newline_token] = 0
1910
- scores = scores + mask
1911
 
1912
  elif self.state == FSMState.LANGUAGE_VALUE:
1913
  # Language field: Use top-1 probability language (greedy selection)
@@ -1925,8 +1917,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1925
  self.user_field_token_queue = value_tokens
1926
  self.current_user_field = "language"
1927
  # Inject first token
1928
- mask[0, value_tokens[0]] = 0
1929
- scores = scores + mask
1930
  return scores
1931
 
1932
  # If we haven't started generating language yet (empty accumulated_token_ids),
@@ -1938,19 +1929,17 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1938
  candidate_tokens = list(self.language_prefix_tree[empty_prefix])
1939
 
1940
  if candidate_tokens:
1941
- # Find the token with highest probability (top-1)
1942
- # Create a mask that blocks all tokens except candidates
1943
- temp_mask = torch.full_like(scores, float('-inf'))
1944
- for t in candidate_tokens:
1945
- temp_mask[0, t] = 0
1946
- temp_scores = scores + temp_mask
1947
 
1948
  # Get the highest probability token among candidates
1949
- top_token_id = torch.argmax(temp_scores[0]).item()
 
1950
 
1951
- # Only allow this top-1 token, block all others (including other language tokens)
1952
- mask[0, top_token_id] = 0
1953
- scores = scores + mask
1954
 
1955
  if self.debug:
1956
  top_token_text = self.tokenizer.decode([top_token_id])
@@ -1958,13 +1947,11 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1958
  else:
1959
  # No valid first tokens found - force newline
1960
  if self.newline_token:
1961
- mask[0, self.newline_token] = 0
1962
- scores = scores + mask
1963
  else:
1964
  # Empty prefix not in tree - force newline
1965
  if self.newline_token:
1966
- mask[0, self.newline_token] = 0
1967
- scores = scores + mask
1968
  else:
1969
  # We've started generating a language, continue with prefix tree constraints
1970
  # Check if current token sequence is complete (allows newline)
@@ -1972,20 +1959,16 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1972
  if token_prefix in self.language_prefix_tree and self.newline_token in self.language_prefix_tree[token_prefix]:
1973
  # Complete language, allow newline
1974
  if self.newline_token:
1975
- mask[0, self.newline_token] = 0
1976
- scores = scores + mask
1977
  else:
1978
  # Not complete, allow valid continuation tokens
1979
  allowed = self._get_allowed_language_tokens()
1980
  if allowed:
1981
- for t in allowed:
1982
- mask[0, t] = 0
1983
- scores = scores + mask
1984
  else:
1985
  # No valid tokens found - force newline to end field
1986
  if self.newline_token:
1987
- mask[0, self.newline_token] = 0
1988
- scores = scores + mask
1989
 
1990
  elif self.state == FSMState.TIMESIG_VALUE:
1991
  # Check if field is user-provided and we haven't started injecting yet
@@ -1998,8 +1981,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1998
  self.user_field_token_queue = value_tokens
1999
  self.current_user_field = "timesignature"
2000
  # Inject first token
2001
- mask[0, value_tokens[0]] = 0
2002
- scores = scores + mask
2003
  return scores
2004
 
2005
  # Check if current token sequence is complete (allows newline)
@@ -2007,14 +1989,11 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
2007
  if token_prefix in self.timesig_prefix_tree and self.newline_token in self.timesig_prefix_tree[token_prefix]:
2008
  # Complete value, allow newline
2009
  if self.newline_token:
2010
- mask[0, self.newline_token] = 0
2011
- scores = scores + mask
2012
  else:
2013
  # Not complete, allow valid continuation tokens
2014
  allowed = self._get_allowed_timesig_tokens()
2015
- for t in allowed:
2016
- mask[0, t] = 0
2017
- scores = scores + mask
2018
 
2019
  return scores
2020
 
 
571
  if self.debug:
572
  logger.debug(f"Built audio code masks for {len(self.audio_code_token_ids)} tokens")
573
 
574
+ def _apply_whitelist_inplace(self, scores: torch.Tensor, allowed_tokens: List[int]) -> None:
575
+ """
576
+ Apply whitelist constraint inplace: only allow specified tokens, block all others.
577
+
578
+ This is more efficient than creating a mask tensor because:
579
+ 1. No memory allocation for mask
580
+ 2. No tensor addition operation
581
+
582
+ Args:
583
+ scores: [1, vocab_size] scores tensor to modify inplace
584
+ allowed_tokens: List of token IDs to allow (all others will be set to -inf)
585
+ """
586
+ if not allowed_tokens:
587
+ # No tokens allowed, set all to -inf
588
+ scores.fill_(float('-inf'))
589
+ return
590
+
591
+ # Save the original values of allowed tokens
592
+ allowed_indices = torch.tensor(allowed_tokens, device=scores.device, dtype=torch.long)
593
+ saved_values = scores[0, allowed_indices].clone()
594
+
595
+ # Set all scores to -inf
596
+ scores.fill_(float('-inf'))
597
+
598
+ # Restore allowed token values
599
+ scores[0, allowed_indices] = saved_values
600
+
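For reference, a minimal standalone sketch of the in-place whitelist trick defined above (hypothetical vocab size; not part of this diff): stash the logits of the allowed tokens, flood the whole row with -inf, then write the saved values back, so no full-vocabulary mask tensor is ever built or added.

import torch
from typing import List

def apply_whitelist_inplace(scores: torch.Tensor, allowed_tokens: List[int]) -> None:
    # scores: [1, vocab_size]; keep only allowed_tokens, block everything else
    if not allowed_tokens:
        scores.fill_(float("-inf"))
        return
    idx = torch.tensor(allowed_tokens, device=scores.device, dtype=torch.long)
    saved = scores[0, idx].clone()   # stash the allowed logits
    scores.fill_(float("-inf"))      # block the whole vocabulary in place
    scores[0, idx] = saved           # restore only the whitelisted tokens

scores = torch.randn(1, 10)          # hypothetical tiny vocab
apply_whitelist_inplace(scores, [2, 5])
# only positions 2 and 5 keep finite logits; no [1, vocab_size] mask is allocated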
601
  def _build_keyscale_prefix_tree(self) -> Dict[Tuple[int, ...], Set[int]]:
602
  """
603
  Build keyscale prefix to allowed tokens mapping based on ACTUAL tokenization.
 
1511
  if self.debug:
1512
  logger.debug(f"Codes generation: {self.codes_count}/{self.target_codes}, blocking EOS")
1513
  else:
1514
+ # Force EOS token when target codes count is reached - inplace
1515
+ eos_scores = scores[:, self.eos_token_id].clone()
1516
+ scores.fill_(float('-inf'))
1517
+ scores[:, self.eos_token_id] = eos_scores
1518
  if self.debug:
1519
  logger.debug(f"Codes generation: {self.codes_count}/{self.target_codes}, forcing EOS")
1520
  return self._apply_temperature_scaling(scores)
 
1636
  input_ids: torch.LongTensor,
1637
  scores: torch.FloatTensor,
1638
  ) -> torch.FloatTensor:
1639
+ """Process a single sequence and return modified scores (inplace when possible)."""
1640
 
1641
  # Check if we have tokens in queue for user-provided field
1642
  # If so, inject the next token directly
1643
  if self.user_field_token_queue:
 
1644
  next_token = self.user_field_token_queue[0]
1645
+ self._apply_whitelist_inplace(scores, [next_token])
 
1646
  return scores
1647
 
 
 
 
1648
  if self.state in self.fixed_strings:
1649
  # Fixed string state: force specific tokens
1650
  fixed_str = self.fixed_strings[self.state]
 
1655
  # This happens when we're about to complete the </think> tag
1656
  if self.state == FSMState.THINK_END_TAG and self.stop_at_reasoning:
1657
  # Check if the next token would complete the fixed string
 
 
 
1658
  remaining_chars = len(fixed_str) - self.position_in_state
1659
  # If remaining is small (<= 10 chars, which is typically 1-2 tokens), force EOS
1660
  if remaining_chars <= 10:
1661
  # Force EOS token to stop generation
1662
  if self.eos_token_id is not None:
1663
+ self._apply_whitelist_inplace(scores, [self.eos_token_id])
 
1664
  if self.debug:
1665
  logger.debug(f"stop_at_reasoning=True: forcing EOS near end of </think> tag (remaining: {remaining_chars} chars)")
1666
  return scores
1667
 
1668
+ # Apply whitelist constraint inplace
1669
+ self._apply_whitelist_inplace(scores, allowed)
 
 
 
 
 
 
1670
  else:
1671
  # Position exceeds string, move to next state
1672
  # If stop_at_reasoning is True and we're transitioning from THINK_END_TAG,
 
1674
  if self.state == FSMState.THINK_END_TAG and self.stop_at_reasoning:
1675
  # Force EOS token to stop generation
1676
  if self.eos_token_id is not None:
1677
+ self._apply_whitelist_inplace(scores, [self.eos_token_id])
 
1678
  if self.debug:
1679
  logger.debug(f"stop_at_reasoning=True: forcing EOS after completing </think> tag")
1680
  return scores
 
1687
  if self.debug:
1688
  logger.warning(f"State transition from {old_state.name} to {self.state.name} still in fixed_strings, avoiding recursion")
1689
  return scores
1690
+ # For recursion, reset scores to zero (no constraints from previous state)
1691
+ scores.zero_()
1692
+ return self._process_single_sequence(input_ids, scores)
1693
 
1694
  elif self.state == FSMState.BPM_VALUE:
1695
  # Check if field is user-provided and we haven't started injecting yet
 
1703
  self.user_field_token_queue = value_tokens
1704
  self.current_user_field = "bpm"
1705
  # Inject first token
1706
+ self._apply_whitelist_inplace(scores, [value_tokens[0]])
 
1707
  return scores
1708
 
1709
  # Allow valid numeric tokens using prefix tree (supports multi-digit tokens like "120")
1710
  allowed = self._get_allowed_numeric_tokens(self.bpm_prefix_tree)
 
 
1711
 
1712
  # Also allow newline if current token sequence prefix allows it
 
1713
  token_prefix = tuple(self.accumulated_token_ids)
1714
  if token_prefix in self.bpm_prefix_tree and self.newline_token in self.bpm_prefix_tree[token_prefix]:
1715
+ allowed = allowed + [self.newline_token]
1716
 
1717
+ self._apply_whitelist_inplace(scores, allowed)
1718
 
1719
  elif self.state == FSMState.CAPTION_VALUE:
1720
  # Caption field generation with YAML format support:
 
1733
  self.user_field_token_queue = value_tokens
1734
  self.current_user_field = "caption"
1735
  # Inject first token
1736
+ self._apply_whitelist_inplace(scores, [value_tokens[0]])
 
1737
  return scores
1738
 
1739
  # Check if we should transition after a newline (non-indented line = new field)
 
1765
  # The field name detection will happen in update_state()
1766
  return scores
1767
 
1768
+ # Block backticks (code blocks) - inplace
1769
  if self.backtick_token is not None:
1770
  scores[0, self.backtick_token] = float('-inf')
1771
 
 
1781
  if self.caption_token_count >= 512:
1782
  # Force end by only allowing newline
1783
  if self.newline_token is not None:
1784
+ self._apply_whitelist_inplace(scores, [self.newline_token])
 
1785
  return scores
1786
 
1787
  # Allow natural generation (with blocked audio codes and backticks)
 
1798
  self.user_field_token_queue = value_tokens
1799
  self.current_user_field = "duration"
1800
  # Inject first token
1801
+ self._apply_whitelist_inplace(scores, [value_tokens[0]])
 
1802
  return scores
1803
 
1804
  # If target_duration is set, force generate that exact value
 
1810
  # Force the next digit
1811
  next_digit = int(target_str[current_pos])
1812
  if next_digit in self.digit_tokens:
1813
+ self._apply_whitelist_inplace(scores, [self.digit_tokens[next_digit]])
1814
  else:
1815
  # All digits generated, force newline
1816
  if self.newline_token:
1817
+ self._apply_whitelist_inplace(scores, [self.newline_token])
 
 
1818
  else:
1819
  # Normal duration generation with range constraint
1820
  # Allow valid numeric tokens using prefix tree (supports multi-digit tokens like "60", "120")
1821
  allowed = self._get_allowed_numeric_tokens(self.duration_prefix_tree)
 
 
1822
 
1823
  # Also allow newline if current token sequence prefix allows it
1824
  token_prefix = tuple(self.accumulated_token_ids)
1825
  if token_prefix in self.duration_prefix_tree and self.newline_token in self.duration_prefix_tree[token_prefix]:
1826
+ allowed = allowed + [self.newline_token]
1827
 
1828
+ self._apply_whitelist_inplace(scores, allowed)
1829
 
1830
  elif self.state == FSMState.GENRES_VALUE:
1831
  # Check if field is user-provided and we haven't started injecting yet
 
1838
  self.user_field_token_queue = value_tokens
1839
  self.current_user_field = "genres"
1840
  # Inject first token
1841
+ self._apply_whitelist_inplace(scores, [value_tokens[0]])
 
1842
  return scores
1843
 
1844
  # Try to hot-reload genres vocab if file has changed
 
1849
 
1850
  if allowed:
1851
  # Use vocabulary-constrained decoding
1852
+ self._apply_whitelist_inplace(scores, allowed)
 
 
1853
  elif self.genres_vocab:
1854
  # Vocab is loaded but no valid continuation found
1855
  # Force newline to end the field
1856
  if self.newline_token:
 
1857
  if self.debug:
1858
  logger.debug(f"No valid genre continuation for '{self.accumulated_value}', forcing newline")
1859
+ self._apply_whitelist_inplace(scores, [self.newline_token])
1860
  else:
1861
  # Fallback: no vocab loaded, use probability-based ending
1862
  if self._should_end_text_field(scores):
1863
  if self.newline_token:
1864
+ self._apply_whitelist_inplace(scores, [self.newline_token])
1865
  self._transition_to_next_state()
 
1866
  else:
1867
  # Allow any token except newline if we don't have content yet
1868
  if not self.accumulated_value.strip():
 
1881
  self.user_field_token_queue = value_tokens
1882
  self.current_user_field = "keyscale"
1883
  # Inject first token
1884
+ self._apply_whitelist_inplace(scores, [value_tokens[0]])
 
1885
  return scores
1886
 
1887
  # Check if current token sequence is complete (allows newline)
 
1889
  if token_prefix in self.keyscale_prefix_tree and self.newline_token in self.keyscale_prefix_tree[token_prefix]:
1890
  # Complete keyscale, allow newline
1891
  if self.newline_token:
1892
+ self._apply_whitelist_inplace(scores, [self.newline_token])
 
1893
  else:
1894
  # Not complete, allow valid continuation tokens
1895
  allowed = self._get_allowed_keyscale_tokens()
1896
  if allowed:
1897
+ self._apply_whitelist_inplace(scores, allowed)
 
 
1898
  else:
1899
  # No valid tokens found - force newline to end field
1900
  # This handles edge cases where keyscale format is unexpected
1901
  if self.newline_token:
1902
+ self._apply_whitelist_inplace(scores, [self.newline_token])
 
1903
 
1904
  elif self.state == FSMState.LANGUAGE_VALUE:
1905
  # Language field: Use top-1 probability language (greedy selection)
 
1917
  self.user_field_token_queue = value_tokens
1918
  self.current_user_field = "language"
1919
  # Inject first token
1920
+ self._apply_whitelist_inplace(scores, [value_tokens[0]])
 
1921
  return scores
1922
 
1923
  # If we haven't started generating language yet (empty accumulated_token_ids),
 
1929
  candidate_tokens = list(self.language_prefix_tree[empty_prefix])
1930
 
1931
  if candidate_tokens:
1932
+ # Find the token with highest probability (top-1) among candidates
1933
+ # Use tensor indexing to get scores of candidate tokens directly
1934
+ candidate_indices = torch.tensor(candidate_tokens, device=scores.device, dtype=torch.long)
1935
+ candidate_scores = scores[0, candidate_indices]
 
 
1936
 
1937
  # Get the highest probability token among candidates
1938
+ best_idx = torch.argmax(candidate_scores).item()
1939
+ top_token_id = candidate_tokens[best_idx]
1940
 
1941
+ # Only allow this top-1 token, block all others
1942
+ self._apply_whitelist_inplace(scores, [top_token_id])
 
1943
 
1944
  if self.debug:
1945
  top_token_text = self.tokenizer.decode([top_token_id])
 
1947
  else:
1948
  # No valid first tokens found - force newline
1949
  if self.newline_token:
1950
+ self._apply_whitelist_inplace(scores, [self.newline_token])
 
1951
  else:
1952
  # Empty prefix not in tree - force newline
1953
  if self.newline_token:
1954
+ self._apply_whitelist_inplace(scores, [self.newline_token])
 
1955
  else:
1956
  # We've started generating a language, continue with prefix tree constraints
1957
  # Check if current token sequence is complete (allows newline)
 
1959
  if token_prefix in self.language_prefix_tree and self.newline_token in self.language_prefix_tree[token_prefix]:
1960
  # Complete language, allow newline
1961
  if self.newline_token:
1962
+ self._apply_whitelist_inplace(scores, [self.newline_token])
 
1963
  else:
1964
  # Not complete, allow valid continuation tokens
1965
  allowed = self._get_allowed_language_tokens()
1966
  if allowed:
1967
+ self._apply_whitelist_inplace(scores, allowed)
 
 
1968
  else:
1969
  # No valid tokens found - force newline to end field
1970
  if self.newline_token:
1971
+ self._apply_whitelist_inplace(scores, [self.newline_token])
 
1972
 
1973
  elif self.state == FSMState.TIMESIG_VALUE:
1974
  # Check if field is user-provided and we haven't started injecting yet
 
1981
  self.user_field_token_queue = value_tokens
1982
  self.current_user_field = "timesignature"
1983
  # Inject first token
1984
+ self._apply_whitelist_inplace(scores, [value_tokens[0]])
 
1985
  return scores
1986
 
1987
  # Check if current token sequence is complete (allows newline)
 
1989
  if token_prefix in self.timesig_prefix_tree and self.newline_token in self.timesig_prefix_tree[token_prefix]:
1990
  # Complete value, allow newline
1991
  if self.newline_token:
1992
+ self._apply_whitelist_inplace(scores, [self.newline_token])
 
1993
  else:
1994
  # Not complete, allow valid continuation tokens
1995
  allowed = self._get_allowed_timesig_tokens()
1996
+ self._apply_whitelist_inplace(scores, allowed)
 
 
1997
 
1998
  return scores
1999
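The LANGUAGE_VALUE branch above picks the single best first token by gathering only the candidate logits instead of building a temporary full-vocabulary mask. A minimal sketch of that selection (token IDs and vocab size are made up for illustration):

import torch

scores = torch.randn(1, 32000)        # [1, vocab_size] logits, hypothetical size
candidate_tokens = [312, 4587, 9906]  # made-up first-token IDs from the language prefix tree

idx = torch.tensor(candidate_tokens, device=scores.device, dtype=torch.long)
candidate_scores = scores[0, idx]                      # gather candidates only
top_token_id = candidate_tokens[int(torch.argmax(candidate_scores))]
# the processor then calls self._apply_whitelist_inplace(scores, [top_token_id])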
 
acestep/gradio_ui/event.py DELETED
The diff for this file is too large to render. See raw diff
 
acestep/gradio_ui/events/__init__.py CHANGED
@@ -254,48 +254,84 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
254
  ]
255
  )
256
 
257
- # Save buttons for audio 1 and 2
258
- for btn_idx, btn_key in [(1, "save_btn_1"), (2, "save_btn_2")]:
259
- results_section[btn_key].click(
260
- fn=res_h.save_audio_and_metadata,
 
261
  inputs=[
262
  results_section[f"generated_audio_{btn_idx}"],
263
- generation_section["task_type"],
264
- generation_section["captions"],
265
- generation_section["lyrics"],
266
- generation_section["vocal_language"],
267
- generation_section["bpm"],
268
- generation_section["key_scale"],
269
- generation_section["time_signature"],
270
- generation_section["audio_duration"],
271
- generation_section["batch_size_input"],
272
- generation_section["inference_steps"],
273
- generation_section["guidance_scale"],
274
- generation_section["seed"],
275
- generation_section["random_seed_checkbox"],
276
- generation_section["use_adg"],
277
- generation_section["cfg_interval_start"],
278
- generation_section["cfg_interval_end"],
279
- generation_section["audio_format"],
280
- generation_section["lm_temperature"],
281
- generation_section["lm_cfg_scale"],
282
- generation_section["lm_top_k"],
283
- generation_section["lm_top_p"],
284
- generation_section["lm_negative_prompt"],
285
- generation_section["use_cot_caption"],
286
- generation_section["use_cot_language"],
287
- generation_section["audio_cover_strength"],
288
- generation_section["think_checkbox"],
289
- generation_section["text2music_audio_code_string"],
290
- generation_section["repainting_start"],
291
- generation_section["repainting_end"],
292
- generation_section["track_name"],
293
- generation_section["complete_track_classes"],
294
- results_section["lm_metadata_state"],
295
  ],
296
- outputs=[gr.File(label="Download Package", visible=False)]
297
- )
298
-
299
  # ========== Send to SRC Handlers ==========
300
  for btn_idx in range(1, 9):
301
  results_section[f"send_to_src_btn_{btn_idx}"].click(
@@ -331,10 +367,11 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
331
  ],
332
  outputs=[results_section[f"score_display_{btn_idx}"], results_section["batch_queue"]]
333
  )
334
-
 
335
  # ========== Generation Handler ==========
336
  generation_section["generate_btn"].click(
337
- fn=lambda *args: res_h.generate_with_batch_management(dit_handler, llm_handler, *args),
338
  inputs=[
339
  generation_section["captions"],
340
  generation_section["lyrics"],
 
254
  ]
255
  )
256
 
257
+ # Save buttons for all 8 audio outputs
258
+ download_existing_js = """(current_audio, batch_files) => {
259
+ // Debug: print what the input actually is
260
+ console.log("👉 [Debug] Current Audio Input:", current_audio);
261
+
262
+ // 1. Safety check
263
+ if (!current_audio) {
264
+ console.warn("⚠️ No audio selected or audio is empty.");
265
+ return;
266
+ }
267
+ if (!batch_files || !Array.isArray(batch_files)) {
268
+ console.warn("⚠️ Batch file list is empty/not ready.");
269
+ return;
270
+ }
271
+
272
+ // 2. Smartly extract path string
273
+ let pathString = "";
274
+
275
+ if (typeof current_audio === "string") {
276
+ // Case A: direct path string received
277
+ pathString = current_audio;
278
+ } else if (typeof current_audio === "object") {
279
+ // Case B: an object is received, try common properties
280
+ // Gradio file objects usually have path, url, or name
281
+ pathString = current_audio.path || current_audio.name || current_audio.url || "";
282
+ }
283
+
284
+ if (!pathString) {
285
+ console.error("❌ Error: Could not extract a valid path string from input.", current_audio);
286
+ return;
287
+ }
288
+
289
+ // 3. Extract Key (UUID)
290
+ // Path could be /tmp/.../uuid.mp3 or url like /file=.../uuid.mp3
291
+ let filename = pathString.split(/[\\\\/]/).pop(); // get the filename
292
+ let key = filename.split('.')[0]; // get UUID without extension
293
+
294
+ console.log(`🔑 Key extracted: ${key}`);
295
+
296
+ // 4. Find matching file(s) in the list
297
+ let targets = batch_files.filter(f => {
298
+ // Also extract names from batch_files objects
299
+ // f usually contains name (backend path) and orig_name (download name)
300
+ const fPath = f.name || f.path || "";
301
+ return fPath.includes(key);
302
+ });
303
+
304
+ if (targets.length === 0) {
305
+ console.warn("❌ No matching files found in batch list for key:", key);
306
+ alert("Batch list does not contain this file yet. Please wait for generation to finish.");
307
+ return;
308
+ }
309
+
310
+ // 5. Trigger download(s)
311
+ console.log(`🎯 Found ${targets.length} files to download.`);
312
+ targets.forEach((f, index) => {
313
+ setTimeout(() => {
314
+ const a = document.createElement('a');
315
+ // Prefer url (frontend-accessible link), otherwise try data
316
+ a.href = f.url || f.data;
317
+ a.download = f.orig_name || "download";
318
+ a.style.display = 'none';
319
+ document.body.appendChild(a);
320
+ a.click();
321
+ document.body.removeChild(a);
322
+ }, index * 1000); // 1s interval between downloads to avoid browser blocking
323
+ });
324
+ }
325
+ """
326
+ for btn_idx in range(1, 9):
327
+ results_section[f"save_btn_{btn_idx}"].click(
328
+ fn=None,
329
  inputs=[
330
  results_section[f"generated_audio_{btn_idx}"],
331
+ results_section["generated_audio_batch"],
 
332
  ],
333
+ js=download_existing_js # run the client-side download script above (fn=None, so no server round-trip)
334
+ )
 
335
  # ========== Send to SRC Handlers ==========
336
  for btn_idx in range(1, 9):
337
  results_section[f"send_to_src_btn_{btn_idx}"].click(
 
367
  ],
368
  outputs=[results_section[f"score_display_{btn_idx}"], results_section["batch_queue"]]
369
  )
370
+ def generation_wrapper(*args):
371
+ yield from res_h.generate_with_batch_management(dit_handler, llm_handler, *args)
372
  # ========== Generation Handler ==========
373
  generation_section["generate_btn"].click(
374
+ fn=generation_wrapper,
375
  inputs=[
376
  generation_section["captions"],
377
  generation_section["lyrics"],
acestep/gradio_ui/events/results_handlers.py CHANGED
@@ -10,9 +10,123 @@ import tempfile
10
  import shutil
11
  import zipfile
12
  import time as time_module
 
13
  import gradio as gr
14
  from loguru import logger
15
  from acestep.gradio_ui.i18n import t
 
16
 
17
 
18
  def store_batch_in_queue(
@@ -66,99 +180,6 @@ def update_navigation_buttons(current_batch, total_batches):
66
  can_go_next = current_batch < total_batches - 1
67
  return can_go_previous, can_go_next
68
 
69
-
70
- def save_audio_and_metadata(
71
- audio_path, task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature, audio_duration,
72
- batch_size_input, inference_steps, guidance_scale, seed, random_seed_checkbox,
73
- use_adg, cfg_interval_start, cfg_interval_end, audio_format,
74
- lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
75
- use_cot_caption, use_cot_language, audio_cover_strength,
76
- think_checkbox, text2music_audio_code_string, repainting_start, repainting_end,
77
- track_name, complete_track_classes, lm_metadata
78
- ):
79
- """Save audio file and its metadata as a zip package"""
80
- if audio_path is None:
81
- gr.Warning(t("messages.no_audio_to_save"))
82
- return None
83
-
84
- try:
85
- # Create metadata dictionary
86
- metadata = {
87
- "saved_at": datetime.datetime.now().isoformat(),
88
- "task_type": task_type,
89
- "caption": captions or "",
90
- "lyrics": lyrics or "",
91
- "vocal_language": vocal_language,
92
- "bpm": bpm if bpm is not None else None,
93
- "keyscale": key_scale or "",
94
- "timesignature": time_signature or "",
95
- "duration": audio_duration if audio_duration is not None else -1,
96
- "batch_size": batch_size_input,
97
- "inference_steps": inference_steps,
98
- "guidance_scale": guidance_scale,
99
- "seed": seed,
100
- "random_seed": False, # Disable random seed for reproducibility
101
- "use_adg": use_adg,
102
- "cfg_interval_start": cfg_interval_start,
103
- "cfg_interval_end": cfg_interval_end,
104
- "audio_format": audio_format,
105
- "lm_temperature": lm_temperature,
106
- "lm_cfg_scale": lm_cfg_scale,
107
- "lm_top_k": lm_top_k,
108
- "lm_top_p": lm_top_p,
109
- "lm_negative_prompt": lm_negative_prompt,
110
- "use_cot_caption": use_cot_caption,
111
- "use_cot_language": use_cot_language,
112
- "audio_cover_strength": audio_cover_strength,
113
- "think": think_checkbox,
114
- "audio_codes": text2music_audio_code_string or "",
115
- "repainting_start": repainting_start,
116
- "repainting_end": repainting_end,
117
- "track_name": track_name,
118
- "complete_track_classes": complete_track_classes or [],
119
- }
120
-
121
- # Add LM-generated metadata if available
122
- if lm_metadata:
123
- metadata["lm_generated_metadata"] = lm_metadata
124
-
125
- # Generate timestamp and base name
126
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
127
-
128
- # Extract audio filename extension
129
- audio_ext = os.path.splitext(audio_path)[1]
130
-
131
- # Create temporary directory for packaging
132
- temp_dir = tempfile.mkdtemp()
133
-
134
- # Save JSON metadata
135
- json_path = os.path.join(temp_dir, f"metadata_{timestamp}.json")
136
- with open(json_path, 'w', encoding='utf-8') as f:
137
- json.dump(metadata, f, indent=2, ensure_ascii=False)
138
-
139
- # Copy audio file
140
- audio_copy_path = os.path.join(temp_dir, f"audio_{timestamp}{audio_ext}")
141
- shutil.copy2(audio_path, audio_copy_path)
142
-
143
- # Create zip file
144
- zip_path = os.path.join(tempfile.gettempdir(), f"music_package_{timestamp}.zip")
145
- with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
146
- zipf.write(audio_copy_path, os.path.basename(audio_copy_path))
147
- zipf.write(json_path, os.path.basename(json_path))
148
-
149
- # Clean up temp directory
150
- shutil.rmtree(temp_dir)
151
-
152
- gr.Info(t("messages.save_success", filename=os.path.basename(zip_path)))
153
- return zip_path
154
-
155
- except Exception as e:
156
- gr.Warning(t("messages.save_failed", error=str(e)))
157
- import traceback
158
- traceback.print_exc()
159
- return None
160
-
161
-
162
  def send_audio_to_src_with_metadata(audio_file, lm_metadata):
163
  """Send generated audio file to src_audio input and populate metadata fields
164
 
@@ -254,366 +275,209 @@ def generate_with_progress(
254
  auto_score,
255
  score_scale,
256
  lm_batch_chunk_size,
257
- progress=gr.Progress(track_tqdm=True)
258
  ):
259
  """Generate audio with progress tracking"""
260
- # If think is enabled (llm_dit mode) and use_cot_metas is True, generate audio codes using LM first
261
- audio_code_string_to_use = text2music_audio_code_string
262
- lm_generated_metadata = None # Store LM-generated metadata for display
263
- lm_generated_audio_codes = None # Store LM-generated audio codes for display
264
- lm_generated_audio_codes_list = [] # Store list of audio codes for batch processing
265
-
266
- # Determine if we should use batch LM generation
267
- should_use_lm_batch = (
268
- think_checkbox and
269
- llm_handler.llm_initialized and
270
- use_cot_metas and
271
- allow_lm_batch and
272
- batch_size_input >= 2
 
273
  )
274
 
275
- if think_checkbox and llm_handler.llm_initialized and use_cot_metas:
276
- # Convert top_k: 0 means None (disabled)
277
- top_k_value = None if lm_top_k == 0 else int(lm_top_k)
278
- # Convert top_p: 1.0 means None (disabled)
279
- top_p_value = None if lm_top_p >= 1.0 else lm_top_p
280
-
281
- # Build user_metadata from user-provided values (only include non-empty values)
282
- user_metadata = {}
283
- # Handle bpm: gr.Number can be None, int, float, or string
284
- if bpm is not None:
285
- try:
286
- bpm_value = float(bpm)
287
- if bpm_value > 0:
288
- user_metadata['bpm'] = str(int(bpm_value))
289
- except (ValueError, TypeError):
290
- # If bpm is not a valid number, skip it
291
- pass
292
- if key_scale and key_scale.strip():
293
- key_scale_clean = key_scale.strip()
294
- if key_scale_clean.lower() not in ["n/a", ""]:
295
- user_metadata['keyscale'] = key_scale_clean
296
- if time_signature and time_signature.strip():
297
- time_sig_clean = time_signature.strip()
298
- if time_sig_clean.lower() not in ["n/a", ""]:
299
- user_metadata['timesignature'] = time_sig_clean
300
- if audio_duration is not None:
301
- try:
302
- duration_value = float(audio_duration)
303
- if duration_value > 0:
304
- user_metadata['duration'] = str(int(duration_value))
305
- except (ValueError, TypeError):
306
- # If audio_duration is not a valid number, skip it
307
- pass
308
-
309
- # Only pass user_metadata if user provided any values, otherwise let LM generate
310
- user_metadata_to_pass = user_metadata if user_metadata else None
311
-
312
- if should_use_lm_batch:
313
- # BATCH LM GENERATION
314
- logger.info(f"Using LM batch generation for {batch_size_input} items...")
315
-
316
- # Prepare seeds for batch items
317
- actual_seed_list, _ = dit_handler.prepare_seeds(batch_size_input, seed, random_seed_checkbox)
318
-
319
- # Split batch into chunks (GPU memory constraint)
320
- max_inference_batch_size = int(lm_batch_chunk_size)
321
- num_chunks = math.ceil(batch_size_input / max_inference_batch_size)
322
-
323
- all_metadata_list = []
324
- all_audio_codes_list = []
325
-
326
- for chunk_idx in range(num_chunks):
327
- chunk_start = chunk_idx * max_inference_batch_size
328
- chunk_end = min(chunk_start + max_inference_batch_size, batch_size_input)
329
- chunk_size = chunk_end - chunk_start
330
- chunk_seeds = actual_seed_list[chunk_start:chunk_end]
331
-
332
- logger.info(f"Generating LM batch chunk {chunk_idx+1}/{num_chunks} (size: {chunk_size}, seeds: {chunk_seeds})...")
333
-
334
- # Generate batch
335
- metadata_list, audio_codes_list, status = llm_handler.generate_with_stop_condition_batch(
336
- caption=captions or "",
337
- lyrics=lyrics or "",
338
- batch_size=chunk_size,
339
- infer_type="llm_dit",
340
- temperature=lm_temperature,
341
- cfg_scale=lm_cfg_scale,
342
- negative_prompt=lm_negative_prompt,
343
- top_k=top_k_value,
344
- top_p=top_p_value,
345
- user_metadata=user_metadata_to_pass,
346
- use_cot_caption=use_cot_caption,
347
- use_cot_language=use_cot_language,
348
- is_format_caption=is_format_caption,
349
- constrained_decoding_debug=constrained_decoding_debug,
350
- seeds=chunk_seeds,
351
- )
352
-
353
- all_metadata_list.extend(metadata_list)
354
- all_audio_codes_list.extend(audio_codes_list)
355
-
356
- # Use first metadata as representative (all are same)
357
- lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
358
 
359
- # Store audio codes list for later use
360
- lm_generated_audio_codes_list = all_audio_codes_list
361
 
362
- # Prepare audio codes for DiT (list of codes, one per batch item)
363
- audio_code_string_to_use = all_audio_codes_list
 
 
 
 
 
 
 
364
 
365
- # Update metadata fields from LM if not provided by user
366
- if lm_generated_metadata:
367
- if bpm is None and lm_generated_metadata.get('bpm'):
368
- bpm_value = lm_generated_metadata.get('bpm')
369
- if bpm_value != "N/A" and bpm_value != "":
370
- try:
371
- bpm = int(bpm_value)
372
- except:
373
- pass
374
- if not key_scale and lm_generated_metadata.get('keyscale'):
375
- key_scale_value = lm_generated_metadata.get('keyscale', lm_generated_metadata.get('key_scale', ""))
376
- if key_scale_value != "N/A":
377
- key_scale = key_scale_value
378
- if not time_signature and lm_generated_metadata.get('timesignature'):
379
- time_signature_value = lm_generated_metadata.get('timesignature', lm_generated_metadata.get('time_signature', ""))
380
- if time_signature_value != "N/A":
381
- time_signature = time_signature_value
382
- if audio_duration is None or audio_duration <= 0:
383
- audio_duration_value = lm_generated_metadata.get('duration', -1)
384
- if audio_duration_value != "N/A" and audio_duration_value != "":
385
- try:
386
- audio_duration = float(audio_duration_value)
387
- except:
388
- pass
389
- else:
390
- # SEQUENTIAL LM GENERATION (current behavior, when allow_lm_batch is False)
391
- # Phase 1: Generate CoT metadata
392
- phase1_start = time_module.time()
393
- metadata, _, status = llm_handler.generate_with_stop_condition(
394
- caption=captions or "",
395
- lyrics=lyrics or "",
396
- infer_type="dit", # Only generate metadata in Phase 1
397
- temperature=lm_temperature,
398
- cfg_scale=lm_cfg_scale,
399
- negative_prompt=lm_negative_prompt,
400
- top_k=top_k_value,
401
- top_p=top_p_value,
402
- user_metadata=user_metadata_to_pass,
403
- use_cot_caption=use_cot_caption,
404
- use_cot_language=use_cot_language,
405
- is_format_caption=is_format_caption,
406
- constrained_decoding_debug=constrained_decoding_debug,
407
- )
408
- lm_phase1_time = time_module.time() - phase1_start
409
- logger.info(f"LM Phase 1 (CoT) completed in {lm_phase1_time:.2f}s")
410
-
411
- # Phase 2: Generate audio codes
412
- phase2_start = time_module.time()
413
- metadata, audio_codes, status = llm_handler.generate_with_stop_condition(
414
- caption=captions or "",
415
- lyrics=lyrics or "",
416
- infer_type="llm_dit", # Generate both metadata and codes
417
- temperature=lm_temperature,
418
- cfg_scale=lm_cfg_scale,
419
- negative_prompt=lm_negative_prompt,
420
- top_k=top_k_value,
421
- top_p=top_p_value,
422
- user_metadata=user_metadata_to_pass,
423
- use_cot_caption=use_cot_caption,
424
- use_cot_language=use_cot_language,
425
- is_format_caption=is_format_caption,
426
- constrained_decoding_debug=constrained_decoding_debug,
427
  )
428
- lm_phase2_time = time_module.time() - phase2_start
429
- logger.info(f"LM Phase 2 (Codes) completed in {lm_phase2_time:.2f}s")
430
-
431
- # Store LM-generated metadata and audio codes for display
432
- lm_generated_metadata = metadata
433
- if audio_codes:
434
- audio_code_string_to_use = audio_codes
435
- lm_generated_audio_codes = audio_codes
436
- # Update metadata fields only if they are empty/None (user didn't provide them)
437
- if bpm is None and metadata.get('bpm'):
438
- bpm_value = metadata.get('bpm')
439
- if bpm_value != "N/A" and bpm_value != "":
440
- try:
441
- bpm = int(bpm_value)
442
- except:
443
- pass
444
- if not key_scale and metadata.get('keyscale'):
445
- key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
446
- if key_scale_value != "N/A":
447
- key_scale = key_scale_value
448
- if not time_signature and metadata.get('timesignature'):
449
- time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
450
- if time_signature_value != "N/A":
451
- time_signature = time_signature_value
452
- if audio_duration is None or audio_duration <= 0:
453
- audio_duration_value = metadata.get('duration', -1)
454
- if audio_duration_value != "N/A" and audio_duration_value != "":
455
- try:
456
- audio_duration = float(audio_duration_value)
457
- except:
458
- pass
459
-
460
- # Call generate_music and get results
461
- result = dit_handler.generate_music(
462
- captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
463
- time_signature=time_signature, vocal_language=vocal_language,
464
- inference_steps=inference_steps, guidance_scale=guidance_scale,
465
- use_random_seed=random_seed_checkbox, seed=seed,
466
- reference_audio=reference_audio, audio_duration=audio_duration,
467
- batch_size=batch_size_input, src_audio=src_audio,
468
- audio_code_string=audio_code_string_to_use,
469
- repainting_start=repainting_start, repainting_end=repainting_end,
470
- instruction=instruction_display_gen, audio_cover_strength=audio_cover_strength,
471
- task_type=task_type, use_adg=use_adg,
472
- cfg_interval_start=cfg_interval_start, cfg_interval_end=cfg_interval_end,
473
- audio_format=audio_format, lm_temperature=lm_temperature,
474
- progress=progress
475
- )
476
-
477
- # Extract results
478
- first_audio, second_audio, all_audio_paths, generation_info, status_message, seed_value_for_ui, \
479
- align_score_1, align_text_1, align_plot_1, align_score_2, align_text_2, align_plot_2 = result
480
-
481
- # Extract LM timing from status if available and prepend to generation_info
482
- if status:
483
- import re
484
- # Try to extract timing info from status using regex
485
- # Expected format: "Phase1: X.XXs" and "Phase2: X.XXs"
486
- phase1_match = re.search(r'Phase1:\s*([\d.]+)s', status)
487
- phase2_match = re.search(r'Phase2:\s*([\d.]+)s', status)
488
-
489
- if phase1_match or phase2_match:
490
- lm_timing_section = "\n\n**🤖 LM Timing:**\n"
491
- lm_total = 0.0
492
- if phase1_match:
493
- phase1_time = float(phase1_match.group(1))
494
- lm_timing_section += f" - Phase 1 (CoT Metadata): {phase1_time:.2f}s\n"
495
- lm_total += phase1_time
496
- if phase2_match:
497
- phase2_time = float(phase2_match.group(1))
498
- lm_timing_section += f" - Phase 2 (Audio Codes): {phase2_time:.2f}s\n"
499
- lm_total += phase2_time
500
- if lm_total > 0:
501
- lm_timing_section += f" - Total LM Time: {lm_total:.2f}s\n"
502
- generation_info = lm_timing_section + "\n" + generation_info
503
-
504
- # Append LM-generated metadata to generation_info if available
505
- if lm_generated_metadata:
506
- metadata_lines = []
507
- if lm_generated_metadata.get('bpm'):
508
- metadata_lines.append(f"- **BPM:** {lm_generated_metadata['bpm']}")
509
- if lm_generated_metadata.get('caption'):
510
- metadata_lines.append(f"- **User Query Rewritten Caption:** {lm_generated_metadata['caption']}")
511
- if lm_generated_metadata.get('duration'):
512
- metadata_lines.append(f"- **Duration:** {lm_generated_metadata['duration']} seconds")
513
- if lm_generated_metadata.get('keyscale'):
514
- metadata_lines.append(f"- **KeyScale:** {lm_generated_metadata['keyscale']}")
515
- if lm_generated_metadata.get('language'):
516
- metadata_lines.append(f"- **Language:** {lm_generated_metadata['language']}")
517
- if lm_generated_metadata.get('timesignature'):
518
- metadata_lines.append(f"- **Time Signature:** {lm_generated_metadata['timesignature']}")
519
-
520
- if metadata_lines:
521
- metadata_section = "\n\n**🤖 LM-Generated Metadata:**\n" + "\n\n".join(metadata_lines)
522
- generation_info = metadata_section + "\n\n" + generation_info
523
-
524
- # Update audio codes in UI if LM generated them
525
- codes_outputs = [""] * 8 # Codes for 8 components
526
- if should_use_lm_batch and lm_generated_audio_codes_list:
527
- # Batch mode: update individual codes inputs
528
- for idx in range(min(len(lm_generated_audio_codes_list), 8)):
529
- codes_outputs[idx] = lm_generated_audio_codes_list[idx]
530
- # For single codes input, show first one
531
- updated_audio_codes = lm_generated_audio_codes_list[0] if lm_generated_audio_codes_list else text2music_audio_code_string
532
- else:
533
- # Single mode: update main codes input
534
- updated_audio_codes = lm_generated_audio_codes if lm_generated_audio_codes else text2music_audio_code_string
535
-
536
- # AUTO-SCORING
537
- score_displays = [""] * 8 # Scores for 8 components
538
- if auto_score and all_audio_paths:
539
- logger.info(f"Auto-scoring enabled, calculating quality scores for {batch_size_input} generated audios...")
540
-
541
- # Determine which audio codes to use for scoring
542
- if should_use_lm_batch and lm_generated_audio_codes_list:
543
- codes_list = lm_generated_audio_codes_list
544
- elif audio_code_string_to_use and isinstance(audio_code_string_to_use, list):
545
- codes_list = audio_code_string_to_use
546
  else:
547
- # Single code string, replicate for all audios
548
- codes_list = [audio_code_string_to_use] * len(all_audio_paths)
549
-
550
- # Calculate scores only for actually generated audios (up to batch_size_input)
551
- # Don't score beyond the actual batch size to avoid duplicates
552
- actual_audios_to_score = min(len(all_audio_paths), int(batch_size_input))
553
- for idx in range(actual_audios_to_score):
554
- if idx < len(codes_list) and codes_list[idx]:
555
- try:
556
- score_display = calculate_score_handler(
557
- llm_handler,
558
- codes_list[idx],
559
- captions,
560
- lyrics,
561
- lm_generated_metadata,
562
- bpm, key_scale, time_signature, audio_duration, vocal_language,
563
- score_scale
564
- )
565
- score_displays[idx] = score_display
566
- logger.info(f"Auto-scored audio {idx+1}")
567
- except Exception as e:
568
- logger.error(f"Auto-scoring failed for audio {idx+1}: {e}")
569
- score_displays[idx] = f"❌ Auto-scoring failed: {str(e)}"
570
-
571
- # Prepare audio outputs (up to 8)
572
- audio_outputs = [None] * 8
573
- for idx in range(min(len(all_audio_paths), 8)):
574
- audio_outputs[idx] = all_audio_paths[idx]
575
 
576
- return (
577
- audio_outputs[0], # generated_audio_1
578
- audio_outputs[1], # generated_audio_2
579
- audio_outputs[2], # generated_audio_3
580
- audio_outputs[3], # generated_audio_4
581
- audio_outputs[4], # generated_audio_5
582
- audio_outputs[5], # generated_audio_6
583
- audio_outputs[6], # generated_audio_7
584
- audio_outputs[7], # generated_audio_8
585
- all_audio_paths, # generated_audio_batch
586
  generation_info,
587
- status_message,
588
  seed_value_for_ui,
589
- align_score_1,
590
- align_text_1,
591
- align_plot_1,
592
- align_score_2,
593
- align_text_2,
594
- align_plot_2,
595
- score_displays[0], # score_display_1
596
- score_displays[1], # score_display_2
597
- score_displays[2], # score_display_3
598
- score_displays[3], # score_display_4
599
- score_displays[4], # score_display_5
600
- score_displays[5], # score_display_6
601
- score_displays[6], # score_display_7
602
- score_displays[7], # score_display_8
603
- updated_audio_codes, # Update main audio codes in UI
604
- codes_outputs[0], # text2music_audio_code_string_1
605
- codes_outputs[1], # text2music_audio_code_string_2
606
- codes_outputs[2], # text2music_audio_code_string_3
607
- codes_outputs[3], # text2music_audio_code_string_4
608
- codes_outputs[4], # text2music_audio_code_string_5
609
- codes_outputs[5], # text2music_audio_code_string_6
610
- codes_outputs[6], # text2music_audio_code_string_7
611
- codes_outputs[7], # text2music_audio_code_string_8
612
- lm_generated_metadata, # Store metadata for "Send to src audio" buttons
613
- is_format_caption, # Keep is_format_caption unchanged
614
  )
615
 
616
 
 
617
  def calculate_score_handler(llm_handler, audio_codes_str, caption, lyrics, lm_metadata, bpm, key_scale, time_signature, audio_duration, vocal_language, score_scale):
618
  """
619
  Calculate PMI-based quality score for generated audio.
@@ -756,7 +620,9 @@ def calculate_score_handler_with_selection(llm_handler, sample_idx, score_scale,
756
  if stored_allow_lm_batch and isinstance(stored_codes, list):
757
  # Batch mode: use specific sample's codes
758
  if 0 <= sample_idx - 1 < len(stored_codes):
759
- audio_codes_str = stored_codes[sample_idx - 1]
 
 
760
  else:
761
  # Single mode: all samples use same codes
762
  audio_codes_str = stored_codes if isinstance(stored_codes, str) else ""
@@ -868,7 +734,7 @@ def generate_with_batch_management(
868
  Wrapper for generate_with_progress that adds batch queue management
869
  """
870
  # Call the original generation function
871
- result = generate_with_progress(
872
  dit_handler, llm_handler,
873
  captions, lyrics, bpm, key_scale, time_signature, vocal_language,
874
  inference_steps, guidance_scale, random_seed_checkbox, seed,
@@ -885,23 +751,41 @@ def generate_with_batch_management(
885
  lm_batch_chunk_size,
886
  progress
887
  )
888
-
889
- # Extract results from generation
890
- all_audio_paths = result[8] # generated_audio_batch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
891
  generation_info = result[9]
892
  seed_value_for_ui = result[11]
893
- lm_generated_metadata = result[34] # Index 34 is lm_metadata_state
894
 
895
  # Extract codes
896
  generated_codes_single = result[26]
897
  generated_codes_batch = [result[27], result[28], result[29], result[30], result[31], result[32], result[33], result[34]]
898
-
899
  # Determine which codes to store based on mode
900
  if allow_lm_batch and batch_size_input >= 2:
901
  codes_to_store = generated_codes_batch[:int(batch_size_input)]
902
  else:
903
  codes_to_store = generated_codes_single
904
-
905
  # Save parameters for history
906
  saved_params = {
907
  "captions": captions,
@@ -947,6 +831,7 @@ def generate_with_batch_management(
947
  }
948
 
949
  # Next batch parameters (with cleared codes & random seed)
 
950
  next_params = saved_params.copy()
951
  next_params["text2music_audio_code_string"] = ""
952
  next_params["random_seed_checkbox"] = True
@@ -979,9 +864,10 @@ def generate_with_batch_management(
979
  next_batch_status_text = ""
980
  if autogen_checkbox:
981
  next_batch_status_text = t("messages.autogen_enabled")
982
-
983
- # Return original results plus batch management state updates
984
- return result + (
 
985
  current_batch_index,
986
  total_batches,
987
  batch_queue,
@@ -1097,7 +983,8 @@ def generate_next_batch_background(
1097
  params.setdefault("complete_track_classes", [])
1098
 
1099
  # Call generate_with_progress with the saved parameters
1100
- result = generate_with_progress(
 
1101
  dit_handler,
1102
  llm_handler,
1103
  captions=params.get("captions"),
@@ -1142,15 +1029,20 @@ def generate_next_batch_background(
1142
  progress=progress
1143
  )
1144
 
1145
- # Extract results
1146
- all_audio_paths = result[8] # generated_audio_batch
1147
- generation_info = result[9]
1148
- seed_value_for_ui = result[11]
1149
- lm_generated_metadata = result[34] # Index 34 is lm_metadata_state
 
 
 
 
 
1150
 
1151
  # Extract codes
1152
- generated_codes_single = result[26]
1153
- generated_codes_batch = [result[27], result[28], result[29], result[30], result[31], result[32], result[33], result[34]]
1154
 
1155
  # Determine which codes to store
1156
  batch_size = params.get("batch_size_input", 2)
@@ -1240,8 +1132,9 @@ def navigate_to_previous_batch(current_batch_index, batch_queue):
1240
 
1241
  # Prepare audio outputs (up to 8)
1242
  audio_outputs = [None] * 8
1243
- for idx in range(min(len(audio_paths), 8)):
1244
- audio_outputs[idx] = audio_paths[idx]
 
1245
 
1246
  # Update batch indicator
1247
  total_batches = len(batch_queue)
@@ -1286,8 +1179,9 @@ def navigate_to_next_batch(autogen_enabled, current_batch_index, total_batches,
1286
 
1287
  # Prepare audio outputs (up to 8)
1288
  audio_outputs = [None] * 8
1289
- for idx in range(min(len(audio_paths), 8)):
1290
- audio_outputs[idx] = audio_paths[idx]
 
1291
 
1292
  # Update batch indicator
1293
  batch_indicator_text = update_batch_indicator(new_batch_index, total_batches)
 
10
  import shutil
11
  import zipfile
12
  import time as time_module
13
+ from typing import Dict, Any, Optional
14
  import gradio as gr
15
  from loguru import logger
16
  from acestep.gradio_ui.i18n import t
17
+ from acestep.inference import generate_music, GenerationParams, GenerationConfig
18
+ from acestep.audio_utils import save_audio
19
+
20
+
21
+ def _build_generation_info(
22
+ lm_metadata: Optional[Dict[str, Any]],
23
+ time_costs: Dict[str, float],
24
+ seed_value: str,
25
+ inference_steps: int,
26
+ num_audios: int,
27
+ ) -> str:
28
+ """Build generation info string from result data.
29
+
30
+ Args:
31
+ lm_metadata: LM-generated metadata dictionary
32
+ time_costs: Unified time costs dictionary
33
+ seed_value: Seed value string
34
+ inference_steps: Number of inference steps
35
+ num_audios: Number of generated audios
36
+
37
+ Returns:
38
+ Formatted generation info string
39
+ """
40
+ info_parts = []
41
+
42
+ # Part 1: LM-generated metadata (if available)
43
+ if lm_metadata:
44
+ metadata_lines = []
45
+ if lm_metadata.get('bpm'):
46
+ metadata_lines.append(f"- **BPM:** {lm_metadata['bpm']}")
47
+ if lm_metadata.get('caption'):
48
+ metadata_lines.append(f"- **Refined Caption:** {lm_metadata['caption']}")
49
+ if lm_metadata.get('lyrics'):
50
+ metadata_lines.append(f"- **Refined Lyrics:** {lm_metadata['lyrics']}")
51
+ if lm_metadata.get('duration'):
52
+ metadata_lines.append(f"- **Duration:** {lm_metadata['duration']} seconds")
53
+ if lm_metadata.get('keyscale'):
54
+ metadata_lines.append(f"- **Key Scale:** {lm_metadata['keyscale']}")
55
+ if lm_metadata.get('language'):
56
+ metadata_lines.append(f"- **Language:** {lm_metadata['language']}")
57
+ if lm_metadata.get('timesignature'):
58
+ metadata_lines.append(f"- **Time Signature:** {lm_metadata['timesignature']}")
59
+
60
+ if metadata_lines:
61
+ metadata_section = "**🤖 LM-Generated Metadata:**\n" + "\n".join(metadata_lines)
62
+ info_parts.append(metadata_section)
63
+
64
+ # Part 2: Time costs (formatted and beautified)
65
+ if time_costs:
66
+ time_lines = []
67
+
68
+ # LM time costs
69
+ lm_phase1 = time_costs.get('lm_phase1_time', 0.0)
70
+ lm_phase2 = time_costs.get('lm_phase2_time', 0.0)
71
+ lm_total = time_costs.get('lm_total_time', 0.0)
72
+
73
+ if lm_total > 0:
74
+ time_lines.append("**🧠 LM Time:**")
75
+ if lm_phase1 > 0:
76
+ time_lines.append(f" - Phase 1 (CoT): {lm_phase1:.2f}s")
77
+ if lm_phase2 > 0:
78
+ time_lines.append(f" - Phase 2 (Codes): {lm_phase2:.2f}s")
79
+ time_lines.append(f" - Total: {lm_total:.2f}s")
80
+
81
+ # DiT time costs
82
+ dit_encoder = time_costs.get('dit_encoder_time_cost', 0.0)
83
+ dit_model = time_costs.get('dit_model_time_cost', 0.0)
84
+ dit_vae_decode = time_costs.get('dit_vae_decode_time_cost', 0.0)
85
+ dit_offload = time_costs.get('dit_offload_time_cost', 0.0)
86
+ dit_total = time_costs.get('dit_total_time_cost', 0.0)
87
+ if dit_total > 0:
88
+ time_lines.append("\n**🎵 DiT Time:**")
89
+ if dit_encoder > 0:
90
+ time_lines.append(f" - Encoder: {dit_encoder:.2f}s")
91
+ if dit_model > 0:
92
+ time_lines.append(f" - Model: {dit_model:.2f}s")
93
+ if dit_vae_decode > 0:
94
+ time_lines.append(f" - VAE Decode: {dit_vae_decode:.2f}s")
95
+ if dit_offload > 0:
96
+ time_lines.append(f" - Offload: {dit_offload:.2f}s")
97
+ time_lines.append(f" - Total: {dit_total:.2f}s")
98
+
99
+ # Post-processing time costs
100
+ audio_conversion_time = time_costs.get('audio_conversion_time', 0.0)
101
+ auto_score_time = time_costs.get('auto_score_time', 0.0)
102
+
103
+ if audio_conversion_time > 0 or auto_score_time > 0:
104
+ time_lines.append("\n**🔧 Post-processing Time:**")
105
+ if audio_conversion_time > 0:
106
+ time_lines.append(f" - Audio Conversion: {audio_conversion_time:.2f}s")
107
+ if auto_score_time > 0:
108
+ time_lines.append(f" - Auto Score: {auto_score_time:.2f}s")
109
+
110
+ # Pipeline total
111
+ pipeline_total = time_costs.get('pipeline_total_time', 0.0)
112
+ if pipeline_total > 0:
113
+ time_lines.append(f"\n**⏱️ Pipeline Total: {pipeline_total:.2f}s**")
114
+
115
+ if time_lines:
116
+ time_section = "\n".join(time_lines)
117
+ info_parts.append(time_section)
118
+
119
+ # Part 3: Generation summary
120
+ summary_lines = [
121
+ "**🎵 Generation Complete**",
122
+ f" - **Seeds:** {seed_value}",
123
+ f" - **Steps:** {inference_steps}",
124
+ f" - **Audio Count:** {num_audios} audio(s)",
125
+ ]
126
+ info_parts.append("\n".join(summary_lines))
127
+
128
+ # Combine all parts
129
+ return "\n\n".join(info_parts)
130
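A quick, hypothetical call to _build_generation_info showing the shape of its output (all field values below are illustrative only):

info = _build_generation_info(
    lm_metadata={"bpm": "120", "keyscale": "C major", "duration": "90"},
    time_costs={"lm_total_time": 3.2, "dit_total_time_cost": 11.5, "pipeline_total_time": 14.7},
    seed_value="1234",
    inference_steps=8,
    num_audios=2,
)
# -> an "LM-Generated Metadata" block, then "LM Time" / "DiT Time" / "Pipeline Total"
#    timings, then the "Generation Complete" summary, joined by blank lines
print(info)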
 
131
 
132
  def store_batch_in_queue(
 
180
  can_go_next = current_batch < total_batches - 1
181
  return can_go_previous, can_go_next
182
 
183
  def send_audio_to_src_with_metadata(audio_file, lm_metadata):
184
  """Send generated audio file to src_audio input and populate metadata fields
185
 
 
275
  auto_score,
276
  score_scale,
277
  lm_batch_chunk_size,
278
+ progress=gr.Progress(track_tqdm=True),
279
  ):
280
  """Generate audio with progress tracking"""
281
+
282
+ # Step 1: prepare inputs for acestep.inference.generate_music
283
+ # by building GenerationParams and GenerationConfig
284
+ gen_params = GenerationParams(
285
+ task_type=task_type,
286
+ instruction=instruction_display_gen,
287
+ reference_audio=reference_audio,
288
+ src_audio=src_audio,
289
+ audio_codes=text2music_audio_code_string if not think_checkbox else "",
290
+ caption=captions or "",
291
+ lyrics=lyrics or "",
292
+ instrumental=False,
293
+ vocal_language=vocal_language,
294
+ bpm=bpm,
295
+ keyscale=key_scale,
296
+ timesignature=time_signature,
297
+ duration=audio_duration,
298
+ inference_steps=inference_steps,
299
+ guidance_scale=guidance_scale,
300
+ use_adg=use_adg,
301
+ cfg_interval_start=cfg_interval_start,
302
+ cfg_interval_end=cfg_interval_end,
303
+ repainting_start=repainting_start,
304
+ repainting_end=repainting_end,
305
+ audio_cover_strength=audio_cover_strength,
306
+ thinking=think_checkbox,
307
+ lm_temperature=lm_temperature,
308
+ lm_cfg_scale=lm_cfg_scale,
309
+ lm_top_k=lm_top_k,
310
+ lm_top_p=lm_top_p,
311
+ lm_negative_prompt=lm_negative_prompt,
312
+ use_cot_metas=use_cot_metas,
313
+ use_cot_caption=use_cot_caption,
314
+ use_cot_language=use_cot_language,
315
+ use_constrained_decoding=True,
316
+ )
317
+ # seed string to list
318
+ if isinstance(seed, str) and seed.strip():
319
+ if "," in seed:
320
+ seed_list = [int(s.strip()) for s in seed.split(",")]
321
+ else:
322
+ seed_list = [int(seed.strip())]
323
+ else:
324
+ seed_list = None
325
+ gen_config = GenerationConfig(
326
+ batch_size=batch_size_input,
327
+ allow_lm_batch=allow_lm_batch,
328
+ use_random_seed=random_seed_checkbox,
329
+ seeds=seed_list,
330
+ lm_batch_chunk_size=lm_batch_chunk_size,
331
+ constrained_decoding_debug=constrained_decoding_debug,
332
+ audio_format=audio_format,
333
+ )
334
+ result = generate_music(
335
+ dit_handler,
336
+ llm_handler,
337
+ params=gen_params,
338
+ config=gen_config,
339
+ progress=progress,
340
  )
341
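As a quick illustration of the seed handling just above, a hypothetical standalone helper that mirrors the inline logic (not part of the PR):

from typing import List, Optional

def parse_seeds(seed) -> Optional[List[int]]:
    # "42" -> [42]; "1, 2, 3" -> [1, 2, 3]; "" or non-string -> None (pipeline picks seeds)
    if isinstance(seed, str) and seed.strip():
        parts = seed.split(",") if "," in seed else [seed]
        return [int(s.strip()) for s in parts]
    return None

assert parse_seeds("42") == [42]
assert parse_seeds("1, 2, 3") == [1, 2, 3]
assert parse_seeds("") is None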
 
342
+ audio_outputs = [None] * 8
343
+ all_audio_paths = []
344
+ final_codes_list = [""] * 8
345
+ final_scores_list = [""] * 8
346
+
347
+ # Build generation_info from result data
348
+ status_message = result.status_message
349
+ seed_value_for_ui = result.extra_outputs.get("seed_value", "")
350
+ lm_generated_metadata = result.extra_outputs.get("lm_metadata", {})
351
+ time_costs = result.extra_outputs.get("time_costs", {}).copy()
352
+
353
+ # Initialize post-processing timing
354
+ audio_conversion_start_time = time_module.time()
355
+ total_auto_score_time = 0.0
356
+
357
+ align_score_1 = ""
358
+ align_text_1 = ""
359
+ align_plot_1 = None
360
+ align_score_2 = ""
361
+ align_text_2 = ""
362
+ align_plot_2 = None
363
+ updated_audio_codes = text2music_audio_code_string if not think_checkbox else ""
364
+
365
+ # Build initial generation_info (will be updated with post-processing times at the end)
366
+ generation_info = _build_generation_info(
367
+ lm_metadata=lm_generated_metadata,
368
+ time_costs=time_costs,
369
+ seed_value=seed_value_for_ui,
370
+ inference_steps=inference_steps,
371
+ num_audios=len(result.audios) if result.success else 0,
372
+ )
373
+
374
+ if not result.success:
375
+ yield (None,) * 8 + (None, generation_info, result.status_message) + (gr.skip(),) * 26
376
+ return
377
+
378
+ audios = result.audios
379
+ progress(0.99, f"Converting audio to {audio_format}...")
380
+ for i in range(8):
381
+ if i < len(audios):
382
+ key = audios[i]["key"]
383
+ audio_tensor = audios[i]["tensor"]
384
+ sample_rate = audios[i]["sample_rate"]
385
+ audio_params = audios[i]["params"]
386
+ temp_dir = tempfile.mkdtemp(prefix="acestep_gradio_results_")
387
+ os.makedirs(temp_dir, exist_ok=True)
388
+ json_path = os.path.join(temp_dir, f"{key}.json")
389
+ audio_path = os.path.join(temp_dir, f"{key}.{audio_format}")
390
+ save_audio(audio_data=audio_tensor, output_path=audio_path, sample_rate=sample_rate, format=audio_format, channels_first=True)
391
+ with open(json_path, 'w', encoding='utf-8') as f:
392
+ json.dump(audio_params, f, indent=2, ensure_ascii=False)
393
+ audio_outputs[i] = audio_path
394
+ all_audio_paths.append(audio_path)
395
+ all_audio_paths.append(json_path)
 
396
 
397
+ code_str = audio_params.get("audio_codes", "")
398
+ final_codes_list[i] = code_str
399
 
400
+ scores_ui_updates = [gr.skip()] * 8
401
+ score_str = "Done!"
402
+ if auto_score:
403
+ auto_score_start = time_module.time()
404
+ score_str = calculate_score_handler(llm_handler, code_str, captions, lyrics, lm_generated_metadata, bpm, key_scale, time_signature, audio_duration, vocal_language, score_scale)
405
+ auto_score_end = time_module.time()
406
+ total_auto_score_time += (auto_score_end - auto_score_start)
407
+ scores_ui_updates[i] = score_str
408
+ final_scores_list[i] = score_str
409
 
410
+ status_message = f"Encoding & Ready: {i+1}/{len(audios)}"
411
+ current_audio_updates = [gr.skip()] * 8
412
+ current_audio_updates[i] = audio_path
413
+
414
+ audio_codes_ui_updates = [gr.skip()] * 8
415
+ audio_codes_ui_updates[i] = code_str
416
+ yield (
417
+ current_audio_updates[0], current_audio_updates[1], current_audio_updates[2], current_audio_updates[3],
418
+ current_audio_updates[4], current_audio_updates[5], current_audio_updates[6], current_audio_updates[7],
419
+ all_audio_paths, # Real-time update of Batch File list
420
+ generation_info,
421
+ status_message,
422
+ seed_value_for_ui,
423
+ # Align plot placeholders (assume no need to update in real time)
424
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
425
+ # Scores
426
+ scores_ui_updates[0], scores_ui_updates[1], scores_ui_updates[2], scores_ui_updates[3], scores_ui_updates[4], scores_ui_updates[5], scores_ui_updates[6], scores_ui_updates[7],
427
+ updated_audio_codes,
428
+ # Codes
429
+ audio_codes_ui_updates[0], audio_codes_ui_updates[1], audio_codes_ui_updates[2], audio_codes_ui_updates[3],
430
+ audio_codes_ui_updates[4], audio_codes_ui_updates[5], audio_codes_ui_updates[6], audio_codes_ui_updates[7],
431
+ lm_generated_metadata,
432
+ is_format_caption,
 
433
  )
 
434
  else:
435
+ # If i exceeds the generated count (e.g., batch=2, i=2..7), do not yield
436
+ pass
437
+ time_module.sleep(0.1)
438
+
439
+ # Record audio conversion time
440
+ audio_conversion_end_time = time_module.time()
441
+ audio_conversion_time = audio_conversion_end_time - audio_conversion_start_time
442
+
443
+ # Add post-processing times to time_costs
444
+ if audio_conversion_time > 0:
445
+ time_costs['audio_conversion_time'] = audio_conversion_time
446
+ if total_auto_score_time > 0:
447
+ time_costs['auto_score_time'] = total_auto_score_time
448
+
449
+ # Update pipeline total time to include post-processing
450
+ if 'pipeline_total_time' in time_costs:
451
+ time_costs['pipeline_total_time'] += audio_conversion_time + total_auto_score_time
452
+
453
+ # Rebuild generation_info with complete timing information
454
+ generation_info = _build_generation_info(
455
+ lm_metadata=lm_generated_metadata,
456
+ time_costs=time_costs,
457
+ seed_value=seed_value_for_ui,
458
+ inference_steps=inference_steps,
459
+ num_audios=len(result.audios),
460
+ )
 
 
461
 
462
+ yield (
463
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), # Audio 1-4: SKIP
464
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), # Audio 5-8: SKIP
465
+ all_audio_paths,
 
466
  generation_info,
467
+ "Generation Complete",
468
  seed_value_for_ui,
469
+ align_score_1, align_text_1, align_plot_1, align_score_2, align_text_2, align_plot_2,
470
+ final_scores_list[0], final_scores_list[1], final_scores_list[2], final_scores_list[3],
471
+ final_scores_list[4], final_scores_list[5], final_scores_list[6], final_scores_list[7],
472
+ updated_audio_codes,
473
+ final_codes_list[0], final_codes_list[1], final_codes_list[2], final_codes_list[3],
474
+ final_codes_list[4], final_codes_list[5], final_codes_list[6], final_codes_list[7],
475
+ lm_generated_metadata,
476
+ is_format_caption,
 
477
  )
478
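The handler above streams its results to the UI as a generator: every `yield` must carry the same fixed-arity output tuple (the failure path yields 8 audio slots, 3 info fields and 26 skips, i.e. 37 outputs), and components that should not change on a given update are filled with `gr.skip()`. A minimal, hedged sketch of that pattern with made-up component names, not the actual output wiring of this app:

```python
import gradio as gr

def stream_results(n_items: int):
    """Generator-style Gradio handler: one fixed-arity tuple per yield."""
    paths = []
    for i in range(n_items):
        path = f"/tmp/audio_{i}.mp3"        # stands in for a freshly saved file
        paths.append(path)
        audio_updates = [gr.skip()] * 8     # leave the untouched players alone
        audio_updates[i] = path             # update only the slot that just finished
        yield (*audio_updates, paths, f"Encoding & Ready: {i + 1}/{n_items}")
    # Final update: keep the players as they are, refresh the file list and status
    yield (*([gr.skip()] * 8), paths, "Generation Complete")

# Wired up with e.g. outputs=[audio_1, ..., audio_8, batch_files, status_text]
```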
 
479
 
480
+
481
  def calculate_score_handler(llm_handler, audio_codes_str, caption, lyrics, lm_metadata, bpm, key_scale, time_signature, audio_duration, vocal_language, score_scale):
482
  """
483
  Calculate PMI-based quality score for generated audio.
 
620
  if stored_allow_lm_batch and isinstance(stored_codes, list):
621
  # Batch mode: use specific sample's codes
622
  if 0 <= sample_idx - 1 < len(stored_codes):
623
+ code_item = stored_codes[sample_idx - 1]
624
+ # Ensure it's a string (handle cases where dict was mistakenly stored)
625
+ audio_codes_str = code_item if isinstance(code_item, str) else ""
626
  else:
627
  # Single mode: all samples use same codes
628
  audio_codes_str = stored_codes if isinstance(stored_codes, str) else ""
 
734
  Wrapper for generate_with_progress that adds batch queue management
735
  """
736
  # Call the original generation function
737
+ generator = generate_with_progress(
738
  dit_handler, llm_handler,
739
  captions, lyrics, bpm, key_scale, time_signature, vocal_language,
740
  inference_steps, guidance_scale, random_seed_checkbox, seed,
 
751
  lm_batch_chunk_size,
752
  progress
753
  )
754
+ final_result_from_inner = None
755
+ for partial_result in generator:
756
+ final_result_from_inner = partial_result
757
+ # current_batch_index, total_batches, batch_queue, next_params,
758
+ # batch_indicator_text, prev_btn, next_btn, next_status, restore_btn
759
+ yield partial_result + (
760
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(),
761
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
762
+ )
763
+ result = final_result_from_inner
764
+ all_audio_paths = result[8]
765
+
766
+ if all_audio_paths is None:
767
+
768
+ yield result + (
769
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(),
770
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
771
+ )
772
+ return
773
+
774
+ # Extract results from generation (access fields by index into the result tuple)
775
  generation_info = result[9]
776
  seed_value_for_ui = result[11]
777
+ lm_generated_metadata = result[35] # Fixed: lm_metadata is at index 35, not 34
778
 
779
  # Extract codes
780
  generated_codes_single = result[26]
781
  generated_codes_batch = [result[27], result[28], result[29], result[30], result[31], result[32], result[33], result[34]]
782
+
783
  # Determine which codes to store based on mode
784
  if allow_lm_batch and batch_size_input >= 2:
785
  codes_to_store = generated_codes_batch[:int(batch_size_input)]
786
  else:
787
  codes_to_store = generated_codes_single
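The wrapper reads fields out of the inner generator's final tuple purely by position (indices 8, 9, 11, 26, 27–34 and 35 appear above, and the `# Fixed: lm_metadata is at index 35, not 34` comment shows how easy the positions are to get wrong). A hedged sketch of naming those positions once; the constant names are illustrative and not part of this commit:

```python
# Positions taken from the indices referenced above; the names are hypothetical.
IDX_BATCH_FILES = 8               # all_audio_paths
IDX_GENERATION_INFO = 9
IDX_SEED = 11
IDX_CODES_SINGLE = 26
IDX_CODES_BATCH = slice(27, 35)   # eight per-sample code strings
IDX_LM_METADATA = 35

def unpack_final_result(result: tuple) -> dict:
    """Pull the fields the batch wrapper needs out of the final yielded tuple."""
    return {
        "all_audio_paths": result[IDX_BATCH_FILES],
        "generation_info": result[IDX_GENERATION_INFO],
        "seed_value_for_ui": result[IDX_SEED],
        "codes_single": result[IDX_CODES_SINGLE],
        "codes_batch": list(result[IDX_CODES_BATCH]),
        "lm_metadata": result[IDX_LM_METADATA],
    }
```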
788
+
789
  # Save parameters for history
790
  saved_params = {
791
  "captions": captions,
 
831
  }
832
 
833
  # Next batch parameters (with cleared codes & random seed)
834
+ # Next batch parameters
835
  next_params = saved_params.copy()
836
  next_params["text2music_audio_code_string"] = ""
837
  next_params["random_seed_checkbox"] = True
 
864
  next_batch_status_text = ""
865
  if autogen_checkbox:
866
  next_batch_status_text = t("messages.autogen_enabled")
867
+
868
+ # 4. Yield final result (includes Batch UI updates)
869
+ # The result here is already a tuple structure
870
+ yield result + (
871
  current_batch_index,
872
  total_batches,
873
  batch_queue,
 
983
  params.setdefault("complete_track_classes", [])
984
 
985
  # Call generate_with_progress with the saved parameters
986
+ # Note: generate_with_progress is a generator; iterate it to completion to obtain the final result
987
+ generator = generate_with_progress(
988
  dit_handler,
989
  llm_handler,
990
  captions=params.get("captions"),
 
1029
  progress=progress
1030
  )
1031
 
1032
+ # Consume generator to get final result (similar to generate_with_batch_management)
1033
+ final_result = None
1034
+ for partial_result in generator:
1035
+ final_result = partial_result
1036
+
1037
+ # Extract results from final_result
1038
+ all_audio_paths = final_result[8] # generated_audio_batch
1039
+ generation_info = final_result[9]
1040
+ seed_value_for_ui = final_result[11]
1041
+ lm_generated_metadata = final_result[35] # Fixed: lm_metadata is at index 35, not 34
1042
 
1043
  # Extract codes
1044
+ generated_codes_single = final_result[26]
1045
+ generated_codes_batch = [final_result[27], final_result[28], final_result[29], final_result[30], final_result[31], final_result[32], final_result[33], final_result[34]]
1046
 
1047
  # Determine which codes to store
1048
  batch_size = params.get("batch_size_input", 2)
 
1132
 
1133
  # Prepare audio outputs (up to 8)
1134
  audio_outputs = [None] * 8
1135
+ real_audio_paths = [p for p in audio_paths if not p.lower().endswith('.json')]
1136
+ for idx in range(min(len(real_audio_paths), 8)):
1137
+ audio_outputs[idx] = real_audio_paths[idx]
1138
 
1139
  # Update batch indicator
1140
  total_batches = len(batch_queue)
 
1179
 
1180
  # Prepare audio outputs (up to 8)
1181
  audio_outputs = [None] * 8
1182
+ real_audio_paths = [p for p in audio_paths if not p.lower().endswith('.json')]
1183
+ for idx in range(min(len(real_audio_paths), 8)):
1184
+ audio_outputs[idx] = real_audio_paths[idx]
1185
 
1186
  # Update batch indicator
1187
  batch_indicator_text = update_batch_indicator(new_batch_index, total_batches)
acestep/gradio_ui/interfaces/result.py CHANGED
@@ -28,7 +28,8 @@ def create_results_section(dit_handler) -> dict:
28
  generated_audio_1 = gr.Audio(
29
  label=t("results.generated_music", n=1),
30
  type="filepath",
31
- interactive=False
 
32
  )
33
  with gr.Row(equal_height=True):
34
  send_to_src_btn_1 = gr.Button(
@@ -58,7 +59,8 @@ def create_results_section(dit_handler) -> dict:
58
  generated_audio_2 = gr.Audio(
59
  label=t("results.generated_music", n=2),
60
  type="filepath",
61
- interactive=False
 
62
  )
63
  with gr.Row(equal_height=True):
64
  send_to_src_btn_2 = gr.Button(
@@ -88,7 +90,8 @@ def create_results_section(dit_handler) -> dict:
88
  generated_audio_3 = gr.Audio(
89
  label=t("results.generated_music", n=3),
90
  type="filepath",
91
- interactive=False
 
92
  )
93
  with gr.Row(equal_height=True):
94
  send_to_src_btn_3 = gr.Button(
@@ -118,7 +121,8 @@ def create_results_section(dit_handler) -> dict:
118
  generated_audio_4 = gr.Audio(
119
  label=t("results.generated_music", n=4),
120
  type="filepath",
121
- interactive=False
 
122
  )
123
  with gr.Row(equal_height=True):
124
  send_to_src_btn_4 = gr.Button(
@@ -151,7 +155,8 @@ def create_results_section(dit_handler) -> dict:
151
  generated_audio_5 = gr.Audio(
152
  label=t("results.generated_music", n=5),
153
  type="filepath",
154
- interactive=False
 
155
  )
156
  with gr.Row(equal_height=True):
157
  send_to_src_btn_5 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -166,7 +171,8 @@ def create_results_section(dit_handler) -> dict:
166
  generated_audio_6 = gr.Audio(
167
  label=t("results.generated_music", n=6),
168
  type="filepath",
169
- interactive=False
 
170
  )
171
  with gr.Row(equal_height=True):
172
  send_to_src_btn_6 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -181,7 +187,8 @@ def create_results_section(dit_handler) -> dict:
181
  generated_audio_7 = gr.Audio(
182
  label=t("results.generated_music", n=7),
183
  type="filepath",
184
- interactive=False
 
185
  )
186
  with gr.Row(equal_height=True):
187
  send_to_src_btn_7 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
@@ -196,7 +203,8 @@ def create_results_section(dit_handler) -> dict:
196
  generated_audio_8 = gr.Audio(
197
  label=t("results.generated_music", n=8),
198
  type="filepath",
199
- interactive=False
 
200
  )
201
  with gr.Row(equal_height=True):
202
  send_to_src_btn_8 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
 
28
  generated_audio_1 = gr.Audio(
29
  label=t("results.generated_music", n=1),
30
  type="filepath",
31
+ interactive=False,
32
+ show_download_button=False
33
  )
34
  with gr.Row(equal_height=True):
35
  send_to_src_btn_1 = gr.Button(
 
59
  generated_audio_2 = gr.Audio(
60
  label=t("results.generated_music", n=2),
61
  type="filepath",
62
+ interactive=False,
63
+ show_download_button=False
64
  )
65
  with gr.Row(equal_height=True):
66
  send_to_src_btn_2 = gr.Button(
 
90
  generated_audio_3 = gr.Audio(
91
  label=t("results.generated_music", n=3),
92
  type="filepath",
93
+ interactive=False,
94
+ show_download_button=False
95
  )
96
  with gr.Row(equal_height=True):
97
  send_to_src_btn_3 = gr.Button(
 
121
  generated_audio_4 = gr.Audio(
122
  label=t("results.generated_music", n=4),
123
  type="filepath",
124
+ interactive=False,
125
+ show_download_button=False
126
  )
127
  with gr.Row(equal_height=True):
128
  send_to_src_btn_4 = gr.Button(
 
155
  generated_audio_5 = gr.Audio(
156
  label=t("results.generated_music", n=5),
157
  type="filepath",
158
+ interactive=False,
159
+ show_download_button=False
160
  )
161
  with gr.Row(equal_height=True):
162
  send_to_src_btn_5 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
 
171
  generated_audio_6 = gr.Audio(
172
  label=t("results.generated_music", n=6),
173
  type="filepath",
174
+ interactive=False,
175
+ show_download_button=False
176
  )
177
  with gr.Row(equal_height=True):
178
  send_to_src_btn_6 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
 
187
  generated_audio_7 = gr.Audio(
188
  label=t("results.generated_music", n=7),
189
  type="filepath",
190
+ interactive=False,
191
+ show_download_button=False
192
  )
193
  with gr.Row(equal_height=True):
194
  send_to_src_btn_7 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
 
203
  generated_audio_8 = gr.Audio(
204
  label=t("results.generated_music", n=8),
205
  type="filepath",
206
+ interactive=False,
207
+ show_download_button=False
208
  )
209
  with gr.Row(equal_height=True):
210
  send_to_src_btn_8 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
acestep/handler.py CHANGED
@@ -10,6 +10,8 @@ import traceback
10
  import re
11
  import random
12
  import uuid
 
 
13
  from contextlib import contextmanager
14
  from typing import Optional, Dict, Any, Tuple, List, Union
15
 
@@ -37,16 +39,12 @@ warnings.filterwarnings("ignore")
37
  class AceStepHandler:
38
  """ACE-Step Business Logic Handler"""
39
 
40
- def __init__(self, save_root = None):
41
  self.model = None
42
  self.config = None
43
  self.device = "cpu"
44
  self.dtype = torch.float32 # Will be set based on device in initialize_service
45
- if save_root is None:
46
- self.temp_dir = tempfile.mkdtemp()
47
- else:
48
- self.temp_dir = save_root
49
-
50
  # VAE for audio encoding/decoding
51
  self.vae = None
52
 
@@ -81,8 +79,7 @@ class AceStepHandler:
81
  def get_available_checkpoints(self) -> str:
82
  """Return project root directory path"""
83
  # Get project root (handler.py is in acestep/, so go up two levels to project root)
84
- current_file = os.path.abspath(__file__)
85
- project_root = os.path.dirname(os.path.dirname(current_file))
86
  # default checkpoints
87
  checkpoint_dir = os.path.join(project_root, "checkpoints")
88
  if os.path.exists(checkpoint_dir):
@@ -93,8 +90,7 @@ class AceStepHandler:
93
  def get_available_acestep_v15_models(self) -> List[str]:
94
  """Scan and return all model directory names starting with 'acestep-v15-'"""
95
  # Get project root
96
- current_file = os.path.abspath(__file__)
97
- project_root = os.path.dirname(os.path.dirname(current_file))
98
  checkpoint_dir = os.path.join(project_root, "checkpoints")
99
 
100
  models = []
@@ -171,8 +167,7 @@ class AceStepHandler:
171
 
172
 
173
  # Auto-detect project root (independent of passed project_root parameter)
174
- current_file = os.path.abspath(__file__)
175
- actual_project_root = os.path.dirname(os.path.dirname(current_file))
176
  checkpoint_dir = os.path.join(actual_project_root, "checkpoints")
177
 
178
  # 1. Load main model
@@ -187,7 +182,7 @@ class AceStepHandler:
187
  attn_implementation = "sdpa"
188
 
189
  try:
190
- logger.info(f"Attempting to load model with attention implementation: {attn_implementation}")
191
  self.model = AutoModel.from_pretrained(
192
  acestep_v15_checkpoint_path,
193
  trust_remote_code=True,
@@ -195,9 +190,9 @@ class AceStepHandler:
195
  dtype="bfloat16"
196
  )
197
  except Exception as e:
198
- logger.warning(f"Failed to load model with {attn_implementation}: {e}")
199
  if attn_implementation == "sdpa":
200
- logger.info("Falling back to eager attention")
201
  attn_implementation = "eager"
202
  self.model = AutoModel.from_pretrained(
203
  acestep_v15_checkpoint_path,
@@ -215,7 +210,7 @@ class AceStepHandler:
215
  else:
216
  # If offload_to_cpu is True, check if we should keep DiT on GPU
217
  if not self.offload_dit_to_cpu:
218
- logger.info(f"Keeping main model on {device} (persistent)")
219
  self.model = self.model.to(device).to(self.dtype)
220
  else:
221
  self.model = self.model.to("cpu").to(self.dtype)
@@ -239,7 +234,7 @@ class AceStepHandler:
239
  raise ValueError(f"Unsupported quantization type: {self.quantization}")
240
 
241
  quantize_(self.model, quant_config)
242
- logger.info(f"DiT quantized with: {self.quantization}")
243
 
244
 
245
  silence_latent_path = os.path.join(acestep_v15_checkpoint_path, "silence_latent.pt")
@@ -260,7 +255,7 @@ class AceStepHandler:
260
  if os.path.exists(vae_checkpoint_path):
261
  self.vae = AutoencoderOobleck.from_pretrained(vae_checkpoint_path)
262
  # Use bfloat16 for VAE on GPU, otherwise use self.dtype (float32 on CPU)
263
- vae_dtype = torch.bfloat16 if device in ["cuda", "xpu"] else self.dtype
264
  if not self.offload_to_cpu:
265
  self.vae = self.vae.to(device).to(vae_dtype)
266
  else:
@@ -302,6 +297,7 @@ class AceStepHandler:
302
 
303
  except Exception as e:
304
  error_msg = f"❌ Error initializing model: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
 
305
  return error_msg, False
306
 
307
  @contextmanager
@@ -326,7 +322,7 @@ class AceStepHandler:
326
  try:
327
  param = next(model.parameters())
328
  if param.device.type == "cpu":
329
- logger.info(f"Moving {model_name} to {self.device} (persistent)")
330
  model.to(self.device).to(self.dtype)
331
  if hasattr(self, "silence_latent"):
332
  self.silence_latent = self.silence_latent.to(self.device).to(self.dtype)
@@ -341,10 +337,10 @@ class AceStepHandler:
341
  return
342
 
343
  # Load to GPU
344
- logger.info(f"Loading {model_name} to {self.device}")
345
  start_time = time.time()
346
  if model_name == "vae":
347
- vae_dtype = torch.bfloat16 if self.device in ["cuda", "xpu"] else self.dtype
348
  model.to(self.device).to(vae_dtype)
349
  else:
350
  model.to(self.device).to(self.dtype)
@@ -354,13 +350,13 @@ class AceStepHandler:
354
 
355
  load_time = time.time() - start_time
356
  self.current_offload_cost += load_time
357
- logger.info(f"Loaded {model_name} to {self.device} in {load_time:.4f}s")
358
 
359
  try:
360
  yield
361
  finally:
362
  # Offload to CPU
363
- logger.info(f"Offloading {model_name} to CPU")
364
  start_time = time.time()
365
  model.to("cpu")
366
 
@@ -370,7 +366,7 @@ class AceStepHandler:
370
  torch.cuda.empty_cache()
371
  offload_time = time.time() - start_time
372
  self.current_offload_cost += offload_time
373
- logger.info(f"Offloaded {model_name} to CPU in {offload_time:.4f}s")
374
 
375
  def process_target_audio(self, audio_file) -> Optional[torch.Tensor]:
376
  """Process target audio"""
@@ -386,23 +382,12 @@ class AceStepHandler:
386
  else:
387
  audio = torch.from_numpy(audio_np.T)
388
 
389
- if audio.shape[0] == 1:
390
- audio = torch.cat([audio, audio], dim=0)
391
-
392
- audio = audio[:2]
393
-
394
- # Resample if needed
395
- if sr != 48000:
396
- import torch.nn.functional as F
397
- ratio = 48000 / sr
398
- new_length = int(audio.shape[-1] * ratio)
399
- audio = F.interpolate(audio.unsqueeze(0), size=new_length, mode='linear', align_corners=False).squeeze(0)
400
-
401
- audio = torch.clamp(audio, -1.0, 1.0)
402
 
403
  return audio
404
  except Exception as e:
405
- logger.error(f"Error processing target audio: {e}")
406
  return None
407
 
408
  def _parse_audio_code_string(self, code_str: str) -> List[int]:
@@ -411,7 +396,8 @@ class AceStepHandler:
411
  return []
412
  try:
413
  return [int(x) for x in re.findall(r"<\|audio_code_(\d+)\|>", code_str)]
414
- except Exception:
 
415
  return []
416
 
417
  def _decode_audio_codes_to_latents(self, code_str: str) -> Optional[torch.Tensor]:
@@ -538,9 +524,7 @@ class AceStepHandler:
538
  )
539
  """
540
  # Align instruction formatting with _prepare_batch
541
- final_instruction = instruction or DEFAULT_DIT_INSTRUCTION
542
- if not final_instruction.endswith(":"):
543
- final_instruction = final_instruction + ":"
544
 
545
  # Extract caption and language from metas if available (from LM CoT output)
546
  # Fallback to user-provided values if not in metas
@@ -571,7 +555,7 @@ class AceStepHandler:
571
 
572
  parsed_meta = self._parse_metas([metas])[0]
573
  caption_input = SFT_GEN_PROMPT.format(final_instruction, actual_caption, parsed_meta)
574
- lyrics_input = f"# Languages\n{actual_language}\n\n# Lyric\n{lyrics}<|endoftext|>"
575
  return caption_input, lyrics_input
576
 
577
  def _get_text_hidden_states(self, text_prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -614,7 +598,7 @@ class AceStepHandler:
614
  return match.group(1).strip()
615
  return caption
616
  except Exception as e:
617
- logger.error(f"Error extracting caption: {e}")
618
  return caption
619
 
620
  def prepare_seeds(self, actual_batch_size, seed, use_random_seed):
@@ -638,7 +622,8 @@ class AceStepHandler:
638
  else:
639
  try:
640
  seed_list.append(int(float(s)))
641
- except (ValueError, TypeError):
 
642
  seed_list.append(-1)
643
  elif seed is None or (isinstance(seed, (int, float)) and seed < 0):
644
  # If seed is None or negative, use -1 for all items
@@ -679,7 +664,176 @@ class AceStepHandler:
679
  return actual_seed_list, seed_value_for_ui
680
 
681
  def prepare_metadata(self, bpm, key_scale, time_signature):
682
- # Build metadata dict - use "N/A" as default for empty fields
 
683
  metadata_dict = {}
684
  if bpm:
685
  metadata_dict["bpm"] = bpm
@@ -695,10 +849,12 @@ class AceStepHandler:
695
  metadata_dict["timesignature"] = time_signature
696
  else:
697
  metadata_dict["timesignature"] = "N/A"
 
698
  return metadata_dict
699
-
700
- def is_silence(self, audio):
701
- return torch.all(audio.abs() < 1e-6)
702
 
703
  def generate_instruction(
704
  self,
@@ -745,23 +901,12 @@ class AceStepHandler:
745
  # Load audio file
746
  audio, sr = torchaudio.load(audio_file)
747
 
748
- logger.info(f"Reference audio shape: {audio.shape}")
749
- logger.info(f"Reference audio sample rate: {sr}")
750
- logger.info(f"Reference audio duration: {audio.shape[-1] / 48000.0} seconds")
751
-
752
- # Convert to stereo (duplicate channel if mono)
753
- if audio.shape[0] == 1:
754
- audio = torch.cat([audio, audio], dim=0)
755
-
756
- # Keep only first 2 channels
757
- audio = audio[:2]
758
 
759
- # Resample to 48kHz if needed
760
- if sr != 48000:
761
- audio = torchaudio.transforms.Resample(sr, 48000)(audio)
762
-
763
- # Clamp values to [-1.0, 1.0]
764
- audio = torch.clamp(audio, -1.0, 1.0)
765
 
766
  is_silence = self.is_silence(audio)
767
  if is_silence:
@@ -800,7 +945,7 @@ class AceStepHandler:
800
  return audio
801
 
802
  except Exception as e:
803
- logger.error(f"Error processing reference audio: {e}")
804
  return None
805
 
806
  def process_src_audio(self, audio_file) -> Optional[torch.Tensor]:
@@ -811,24 +956,13 @@ class AceStepHandler:
811
  # Load audio file
812
  audio, sr = torchaudio.load(audio_file)
813
 
814
- # Convert to stereo (duplicate channel if mono)
815
- if audio.shape[0] == 1:
816
- audio = torch.cat([audio, audio], dim=0)
817
-
818
- # Keep only first 2 channels
819
- audio = audio[:2]
820
-
821
- # Resample to 48kHz if needed
822
- if sr != 48000:
823
- audio = torchaudio.transforms.Resample(sr, 48000)(audio)
824
-
825
- # Clamp values to [-1.0, 1.0]
826
- audio = torch.clamp(audio, -1.0, 1.0)
827
 
828
  return audio
829
 
830
  except Exception as e:
831
- logger.error(f"Error processing target audio: {e}")
832
  return None
833
 
834
  def convert_src_audio_to_codes(self, audio_file) -> str:
@@ -856,19 +990,12 @@ class AceStepHandler:
856
  # Encode audio to latents using VAE
857
  with torch.no_grad():
858
  with self._load_model_context("vae"):
859
- # Prepare audio for VAE: [channels, samples] -> [1, channels, samples]
860
- vae_input = processed_audio.unsqueeze(0).to(self.device).to(self.vae.dtype)
861
-
862
  # Check if audio is silence
863
- if self.is_silence(vae_input):
864
  return "❌ Audio file appears to be silent"
865
 
866
- # Encode to latents
867
- latents = self.vae.encode(vae_input).latent_dist.sample()
868
- # Cast back to model dtype
869
- latents = latents.to(self.dtype)
870
- # Transpose: [1, d, T] -> [1, T, d] -> [T, d]
871
- latents = latents.squeeze(0).transpose(0, 1) # [T, d]
872
 
873
  # Create attention mask for latents
874
  attention_mask = torch.ones(latents.shape[0], dtype=torch.bool, device=self.device)
@@ -893,7 +1020,7 @@ class AceStepHandler:
893
 
894
  except Exception as e:
895
  error_msg = f"❌ Error converting audio to codes: {str(e)}\n{traceback.format_exc()}"
896
- logger.error(error_msg)
897
  return error_msg
898
 
899
  def prepare_batch_data(
@@ -922,26 +1049,7 @@ class AceStepHandler:
922
  calculated_duration = audio_duration
923
 
924
  # Build metadata dict - use "N/A" as default for empty fields
925
- metadata_dict = {}
926
- if bpm:
927
- metadata_dict["bpm"] = bpm
928
- else:
929
- metadata_dict["bpm"] = "N/A"
930
-
931
- if key_scale.strip():
932
- metadata_dict["keyscale"] = key_scale
933
- else:
934
- metadata_dict["keyscale"] = "N/A"
935
-
936
- if time_signature.strip() and time_signature != "N/A" and time_signature:
937
- metadata_dict["timesignature"] = time_signature
938
- else:
939
- metadata_dict["timesignature"] = "N/A"
940
-
941
- # Add duration to metadata if available (inference service format: "30 seconds")
942
- if calculated_duration is not None:
943
- metadata_dict["duration"] = f"{int(calculated_duration)} seconds"
944
- # If duration not set, inference service will use default (30 seconds)
945
 
946
  # Format metadata - inference service accepts dict and will convert to string
947
  # Create a copy for each batch item (in case we modify it)
@@ -977,7 +1085,7 @@ class AceStepHandler:
977
  target_wavs = torch.zeros(2, frames)
978
  return target_wavs
979
  except Exception as e:
980
- logger.error(f"Error creating target audio: {e}")
981
  # Fallback to 30 seconds if error
982
  return torch.zeros(2, 30 * 48000)
983
 
@@ -1158,16 +1266,8 @@ class AceStepHandler:
1158
  """
1159
  batch_size = len(captions)
1160
 
1161
- # Ensure audio_code_hints is a list of the correct length
1162
- if audio_code_hints is None:
1163
- audio_code_hints = [None] * batch_size
1164
- elif len(audio_code_hints) != batch_size:
1165
- if len(audio_code_hints) == 1:
1166
- audio_code_hints = audio_code_hints * batch_size
1167
- else:
1168
- audio_code_hints = audio_code_hints[:batch_size]
1169
- while len(audio_code_hints) < batch_size:
1170
- audio_code_hints.append(None)
1171
 
1172
  for ii, refer_audio_list in enumerate(refer_audios):
1173
  if isinstance(refer_audio_list, list):
@@ -1179,17 +1279,6 @@ class AceStepHandler:
1179
  if vocal_languages is None:
1180
  vocal_languages = self._create_fallback_vocal_languages(batch_size)
1181
 
1182
- # Normalize audio_code_hints to batch list
1183
- if audio_code_hints is None:
1184
- audio_code_hints = [None] * batch_size
1185
- elif not isinstance(audio_code_hints, list):
1186
- audio_code_hints = [audio_code_hints] * batch_size
1187
- elif len(audio_code_hints) == 1 and batch_size > 1:
1188
- audio_code_hints = audio_code_hints * batch_size
1189
- else:
1190
- audio_code_hints = (audio_code_hints + [None] * batch_size)[:batch_size]
1191
- audio_code_hints = [hint if isinstance(hint, str) and hint.strip() else None for hint in audio_code_hints]
1192
-
1193
  # Parse metas with fallbacks
1194
  parsed_metas = self._parse_metas(metas)
1195
 
@@ -1223,13 +1312,9 @@ class AceStepHandler:
1223
  expected_latent_length = current_wav.shape[-1] // 1920
1224
  target_latent = self.silence_latent[0, :expected_latent_length, :]
1225
  else:
1226
- # Ensure input is in VAE's dtype
1227
  logger.info(f"[generate_music] Encoding target audio to latents for item {i}...")
1228
- vae_input = current_wav.to(self.device).to(self.vae.dtype)
1229
- target_latent = self.vae.encode(vae_input).latent_dist.sample()
1230
- # Cast back to model dtype
1231
- target_latent = target_latent.to(self.dtype)
1232
- target_latent = target_latent.squeeze(0).transpose(0, 1)
1233
  target_latents_list.append(target_latent)
1234
  latent_lengths.append(target_latent.shape[0])
1235
 
@@ -1268,18 +1353,7 @@ class AceStepHandler:
1268
 
1269
  # Process instructions early so we can use them for task type detection
1270
  # Use custom instructions if provided, otherwise use default
1271
- if instructions is None:
1272
- instructions = [DEFAULT_DIT_INSTRUCTION] * batch_size
1273
-
1274
- # Ensure instructions list has the same length as batch_size
1275
- if len(instructions) != batch_size:
1276
- if len(instructions) == 1:
1277
- instructions = instructions * batch_size
1278
- else:
1279
- # Pad or truncate to match batch_size
1280
- instructions = instructions[:batch_size]
1281
- while len(instructions) < batch_size:
1282
- instructions.append(DEFAULT_DIT_INSTRUCTION)
1283
 
1284
  # Generate chunk_masks and spans based on repainting parameters
1285
  # Also determine if this is a cover task (target audio provided without repainting)
@@ -1428,6 +1502,10 @@ class AceStepHandler:
1428
  else:
1429
  precomputed_lm_hints_25Hz = None
1430
 
 
1431
  # Format text_inputs
1432
  text_inputs = []
1433
  text_token_idss = []
@@ -1437,26 +1515,10 @@ class AceStepHandler:
1437
 
1438
  for i in range(batch_size):
1439
  # Use custom instruction for this batch item
1440
- instruction = instructions[i] if i < len(instructions) else DEFAULT_DIT_INSTRUCTION
1441
- # Ensure instruction ends with ":"
1442
- if not instruction.endswith(":"):
1443
- instruction = instruction + ":"
1444
-
1445
- # Extract caption and language from metas if available (from LM CoT output)
1446
- # Fallback to user-provided values if not in metas
1447
- actual_caption = captions[i]
1448
- actual_language = vocal_languages[i]
1449
-
1450
- # Check if metas contains caption/language from LM CoT
1451
- if i < len(parsed_metas) and parsed_metas[i]:
1452
- meta_dict = parsed_metas[i]
1453
- if isinstance(meta_dict, dict):
1454
- # Extract caption from metas if available
1455
- if 'caption' in meta_dict and meta_dict['caption']:
1456
- actual_caption = str(meta_dict['caption'])
1457
- # Extract language from metas if available
1458
- if 'language' in meta_dict and meta_dict['language']:
1459
- actual_language = str(meta_dict['language'])
1460
 
1461
  # Format text prompt with custom instruction (using LM-generated caption if available)
1462
  text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
@@ -1473,7 +1535,7 @@ class AceStepHandler:
1473
  text_attention_mask = text_inputs_dict.attention_mask[0].bool()
1474
 
1475
  # Format and tokenize lyrics (using LM-generated language if available)
1476
- lyrics_text = f"# Languages\n{actual_language}\n\n# Lyric\n{lyrics[i]}<|endoftext|>"
1477
  lyrics_inputs_dict = self.text_tokenizer(
1478
  lyrics_text,
1479
  padding="longest",
@@ -1495,36 +1557,12 @@ class AceStepHandler:
1495
 
1496
  # Pad tokenized sequences
1497
  max_text_length = max(len(seq) for seq in text_token_idss)
1498
- padded_text_token_idss = torch.stack([
1499
- torch.nn.functional.pad(
1500
- seq, (0, max_text_length - len(seq)), 'constant',
1501
- self.text_tokenizer.pad_token_id
1502
- )
1503
- for seq in text_token_idss
1504
- ])
1505
-
1506
- padded_text_attention_masks = torch.stack([
1507
- torch.nn.functional.pad(
1508
- seq, (0, max_text_length - len(seq)), 'constant', 0
1509
- )
1510
- for seq in text_attention_masks
1511
- ])
1512
 
1513
  max_lyric_length = max(len(seq) for seq in lyric_token_idss)
1514
- padded_lyric_token_idss = torch.stack([
1515
- torch.nn.functional.pad(
1516
- seq, (0, max_lyric_length - len(seq)), 'constant',
1517
- self.text_tokenizer.pad_token_id
1518
- )
1519
- for seq in lyric_token_idss
1520
- ])
1521
-
1522
- padded_lyric_attention_masks = torch.stack([
1523
- torch.nn.functional.pad(
1524
- seq, (0, max_lyric_length - len(seq)), 'constant', 0
1525
- )
1526
- for seq in lyric_attention_masks
1527
- ])
1528
 
1529
  padded_non_cover_text_input_ids = None
1530
  padded_non_cover_text_attention_masks = None
@@ -1533,14 +1571,10 @@ class AceStepHandler:
1533
  non_cover_text_attention_masks = []
1534
  for i in range(batch_size):
1535
  # Use custom instruction for this batch item
1536
- instruction = DEFAULT_DIT_INSTRUCTION
1537
 
1538
  # Extract caption from metas if available (from LM CoT output)
1539
- actual_caption = captions[i]
1540
- if i < len(parsed_metas) and parsed_metas[i]:
1541
- meta_dict = parsed_metas[i]
1542
- if isinstance(meta_dict, dict) and 'caption' in meta_dict and meta_dict['caption']:
1543
- actual_caption = str(meta_dict['caption'])
1544
 
1545
  # Format text prompt with custom instruction (using LM-generated caption if available)
1546
  text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
@@ -1558,19 +1592,8 @@ class AceStepHandler:
1558
  non_cover_text_input_ids.append(text_token_ids)
1559
  non_cover_text_attention_masks.append(non_cover_text_attention_mask)
1560
 
1561
- padded_non_cover_text_input_ids = torch.stack([
1562
- torch.nn.functional.pad(
1563
- seq, (0, max_text_length - len(seq)), 'constant',
1564
- self.text_tokenizer.pad_token_id
1565
- )
1566
- for seq in non_cover_text_input_ids
1567
- ])
1568
- padded_non_cover_text_attention_masks = torch.stack([
1569
- torch.nn.functional.pad(
1570
- seq, (0, max_text_length - len(seq)), 'constant', 0
1571
- )
1572
- for seq in non_cover_text_attention_masks
1573
- ])
1574
 
1575
  if audio_cover_strength < 1.0:
1576
  assert padded_non_cover_text_input_ids is not None, "When audio_cover_strength < 1.0, padded_non_cover_text_input_ids must not be None"
@@ -1804,7 +1827,7 @@ class AceStepHandler:
1804
  if self.config.is_turbo:
1805
  # Limit inference steps to maximum 8
1806
  if infer_steps > 8:
1807
- logger.warning(f"dmd_gan version: infer_steps {infer_steps} exceeds maximum 8, clamping to 8")
1808
  infer_steps = 8
1809
  # CFG parameters are not adjustable for dmd_gan (they will be ignored)
1810
  # Note: guidance_scale, cfg_interval_start, cfg_interval_end are still passed but may be ignored by the model
@@ -1827,30 +1850,12 @@ class AceStepHandler:
1827
  if isinstance(repainting_end, (int, float)):
1828
  repainting_end = [repainting_end]
1829
 
1830
- # Convert instructions to list
1831
- if isinstance(instructions, str):
1832
- instructions = [instructions]
1833
- elif instructions is None:
1834
- instructions = None
1835
-
1836
- # Convert audio_code_hints to list
1837
- if isinstance(audio_code_hints, str):
1838
- audio_code_hints = [audio_code_hints]
1839
- elif audio_code_hints is None:
1840
- audio_code_hints = None
1841
-
1842
  # Get batch size from captions
1843
  batch_size = len(captions)
1844
 
1845
- # Ensure audio_code_hints matches batch size
1846
- if audio_code_hints is not None:
1847
- if len(audio_code_hints) != batch_size:
1848
- if len(audio_code_hints) == 1:
1849
- audio_code_hints = audio_code_hints * batch_size
1850
- else:
1851
- audio_code_hints = audio_code_hints[:batch_size]
1852
- while len(audio_code_hints) < batch_size:
1853
- audio_code_hints.append(None)
1854
 
1855
  # Convert seed to list format
1856
  if seed is None:
@@ -1947,6 +1952,14 @@ class AceStepHandler:
1947
  logger.info("[service_generate] Generating audio...")
1948
  with self._load_model_context("model"):
1949
  outputs = self.model.generate_audio(**generate_kwargs)
 
1950
  return outputs
1951
 
1952
  def tiled_decode(self, latents, chunk_size=512, overlap=64):
@@ -2042,25 +2055,33 @@ class AceStepHandler:
2042
  use_adg: bool = False,
2043
  cfg_interval_start: float = 0.0,
2044
  cfg_interval_end: float = 1.0,
2045
- audio_format: str = "mp3",
2046
- lm_temperature: float = 0.6,
2047
  use_tiled_decode: bool = True,
2048
  progress=None
2049
- ) -> Tuple[Optional[str], Optional[str], List[str], str, str, str, str, str, Optional[Any], str, str, Optional[Any]]:
2050
  """
2051
  Main interface for music generation
2052
 
2053
  Returns:
2054
- (first_audio, second_audio, all_audio_paths, generation_info, status_message,
2055
- seed_value_for_ui, align_score_1, align_text_1, align_plot_1,
2056
- align_score_2, align_text_2, align_plot_2)
 
 
  """
2058
  if progress is None:
2059
  def progress(*args, **kwargs):
2060
  pass
2061
 
2062
  if self.model is None or self.vae is None or self.text_tokenizer is None or self.text_encoder is None:
2063
- return None, None, [], "", "❌ Model not fully initialized. Please initialize all components first.", "-1", "", "", None, "", "", None
 
 
2065
  def _has_audio_codes(v: Union[str, List[str]]) -> bool:
2066
  if isinstance(v, list):
@@ -2079,7 +2100,7 @@ class AceStepHandler:
2079
 
2080
  logger.info("[generate_music] Starting generation...")
2081
  if progress:
2082
- progress(0.05, desc="Preparing inputs...")
2083
  logger.info("[generate_music] Preparing inputs...")
2084
 
2085
  # Reset offload cost
@@ -2101,8 +2122,6 @@ class AceStepHandler:
2101
  repainting_end = None
2102
 
2103
  try:
2104
- progress(0.1, desc="Preparing inputs...")
2105
-
2106
  # 1. Process reference audio
2107
  refer_audios = None
2108
  if reference_audio is not None:
@@ -2154,7 +2173,7 @@ class AceStepHandler:
2154
  can_use_repainting
2155
  )
2156
 
2157
- progress(0.3, desc=f"Generating music (batch size: {actual_batch_size})...")
2158
 
2159
  # Prepare audio_code_hints - use if audio_code_string is provided
2160
  # This works for both text2music (auto-switched to cover) and cover tasks
@@ -2191,8 +2210,8 @@ class AceStepHandler:
2191
  pred_latents = outputs["target_latents"] # [batch, latent_length, latent_dim]
2192
  time_costs = outputs["time_costs"]
2193
  time_costs["offload_time_cost"] = self.current_offload_cost
2194
- logger.info(f" - pred_latents: {pred_latents.shape}, dtype={pred_latents.dtype} {pred_latents.min()=}, {pred_latents.max()=}, {pred_latents.mean()=} {pred_latents.std()=}")
2195
- logger.info(f" - time_costs: {time_costs}")
2196
  if progress:
2197
  progress(0.8, desc="Decoding audio...")
2198
  logger.info("[generate_music] Decoding latents with VAE...")
@@ -2221,75 +2240,66 @@ class AceStepHandler:
2221
  # Update offload cost one last time to include VAE offloading
2222
  time_costs["offload_time_cost"] = self.current_offload_cost
2223
 
2224
- logger.info("[generate_music] VAE decode completed. Saving audio files...")
2225
  if progress:
2226
- progress(0.9, desc="Saving audio files...")
2227
 
2228
- # Save audio files using soundfile (supports wav, flac, mp3 via format param)
2229
- audio_format_lower = audio_format.lower() if audio_format else "wav"
2230
- if audio_format_lower not in ["wav", "flac", "mp3"]:
2231
- audio_format_lower = "wav"
2232
 
2233
- saved_files = []
2234
- saved_uuids = [] # Store UUIDs for each file
2235
  for i in range(actual_batch_size):
2236
- # Generate unique UUID for each audio file
2237
- file_uuid = str(uuid.uuid4())
2238
- audio_file = os.path.join(self.temp_dir, f"{file_uuid}.{audio_format_lower}")
2239
- # Convert to numpy: [channels, samples] -> [samples, channels]
2240
- audio_np = pred_wavs[i].cpu().float().numpy().T
2241
- sf.write(audio_file, audio_np, self.sample_rate)
2242
- saved_files.append(audio_file)
2243
- saved_uuids.append(file_uuid)
2244
-
2245
- # Prepare return values
2246
- first_audio = saved_files[0] if len(saved_files) > 0 else None
2247
- second_audio = saved_files[1] if len(saved_files) > 1 else None
2248
-
2249
- # Format time costs if available
2250
- time_costs_str = ""
2251
- if time_costs:
2252
- if isinstance(time_costs, dict):
2253
- time_costs_str = "\n\n**⏱️ Time Costs:**\n"
2254
- for key, value in time_costs.items():
2255
- # Format key: encoder_time_cost -> Encoder
2256
- formatted_key = key.replace("_time_cost", "").replace("_", " ").title()
2257
- time_costs_str += f" - {formatted_key}: {value:.2f}s\n"
2258
- elif isinstance(time_costs, (int, float)):
2259
- time_costs_str = f"\n\n**⏱️ Time Cost:** {time_costs:.2f}s"
2260
-
2261
- generation_info = f"""**🎵 Generation Complete**
2262
-
2263
- **Seeds:** {seed_value_for_ui}
2264
- **Steps:** {inference_steps}
2265
- **Files:** {len(saved_files)} audio(s){time_costs_str}"""
2266
  status_message = f"✅ Generation completed successfully!"
2267
- logger.info(f"[generate_music] Done! Generated {len(saved_files)} audio files.")
2268
-
2269
- # Alignment scores and plots (placeholder for now)
2270
- align_score_1 = ""
2271
- align_text_1 = ""
2272
- align_plot_1 = None
2273
- align_score_2 = ""
2274
- align_text_2 = ""
2275
- align_plot_2 = None
2276
-
2277
- return (
2278
- first_audio,
2279
- second_audio,
2280
- saved_files,
2281
- generation_info,
2282
- status_message,
2283
- seed_value_for_ui,
2284
- align_score_1,
2285
- align_text_1,
2286
- align_plot_1,
2287
- align_score_2,
2288
- align_text_2,
2289
- align_plot_2,
2290
- )
 
2291
 
2292
  except Exception as e:
2293
  error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
2294
- return None, None, [], "", error_msg, seed_value_for_ui, "", "", None, "", "", None
2295
-
 
 
10
  import re
11
  import random
12
  import uuid
13
+ import hashlib
14
+ import json
15
  from contextlib import contextmanager
16
  from typing import Optional, Dict, Any, Tuple, List, Union
17
 
 
39
  class AceStepHandler:
40
  """ACE-Step Business Logic Handler"""
41
 
42
+ def __init__(self):
43
  self.model = None
44
  self.config = None
45
  self.device = "cpu"
46
  self.dtype = torch.float32 # Will be set based on device in initialize_service
47
+
 
48
  # VAE for audio encoding/decoding
49
  self.vae = None
50
 
 
79
  def get_available_checkpoints(self) -> str:
80
  """Return project root directory path"""
81
  # Get project root (handler.py is in acestep/, so go up two levels to project root)
82
+ project_root = self._get_project_root()
 
83
  # default checkpoints
84
  checkpoint_dir = os.path.join(project_root, "checkpoints")
85
  if os.path.exists(checkpoint_dir):
 
90
  def get_available_acestep_v15_models(self) -> List[str]:
91
  """Scan and return all model directory names starting with 'acestep-v15-'"""
92
  # Get project root
93
+ project_root = self._get_project_root()
 
94
  checkpoint_dir = os.path.join(project_root, "checkpoints")
95
 
96
  models = []
 
167
 
168
 
169
  # Auto-detect project root (independent of passed project_root parameter)
170
+ actual_project_root = self._get_project_root()
 
171
  checkpoint_dir = os.path.join(actual_project_root, "checkpoints")
172
 
173
  # 1. Load main model
 
182
  attn_implementation = "sdpa"
183
 
184
  try:
185
+ logger.info(f"[initialize_service] Attempting to load model with attention implementation: {attn_implementation}")
186
  self.model = AutoModel.from_pretrained(
187
  acestep_v15_checkpoint_path,
188
  trust_remote_code=True,
 
190
  dtype="bfloat16"
191
  )
192
  except Exception as e:
193
+ logger.warning(f"[initialize_service] Failed to load model with {attn_implementation}: {e}")
194
  if attn_implementation == "sdpa":
195
+ logger.info("[initialize_service] Falling back to eager attention")
196
  attn_implementation = "eager"
197
  self.model = AutoModel.from_pretrained(
198
  acestep_v15_checkpoint_path,
 
210
  else:
211
  # If offload_to_cpu is True, check if we should keep DiT on GPU
212
  if not self.offload_dit_to_cpu:
213
+ logger.info(f"[initialize_service] Keeping main model on {device} (persistent)")
214
  self.model = self.model.to(device).to(self.dtype)
215
  else:
216
  self.model = self.model.to("cpu").to(self.dtype)
 
234
  raise ValueError(f"Unsupported quantization type: {self.quantization}")
235
 
236
  quantize_(self.model, quant_config)
237
+ logger.info(f"[initialize_service] DiT quantized with: {self.quantization}")
238
 
239
 
240
  silence_latent_path = os.path.join(acestep_v15_checkpoint_path, "silence_latent.pt")
 
255
  if os.path.exists(vae_checkpoint_path):
256
  self.vae = AutoencoderOobleck.from_pretrained(vae_checkpoint_path)
257
  # Use bfloat16 for VAE on GPU, otherwise use self.dtype (float32 on CPU)
258
+ vae_dtype = self._get_vae_dtype(device)
259
  if not self.offload_to_cpu:
260
  self.vae = self.vae.to(device).to(vae_dtype)
261
  else:
 
297
 
298
  except Exception as e:
299
  error_msg = f"❌ Error initializing model: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
300
+ logger.exception("[initialize_service] Error initializing model")
301
  return error_msg, False
302
 
303
  @contextmanager
 
322
  try:
323
  param = next(model.parameters())
324
  if param.device.type == "cpu":
325
+ logger.info(f"[_load_model_context] Moving {model_name} to {self.device} (persistent)")
326
  model.to(self.device).to(self.dtype)
327
  if hasattr(self, "silence_latent"):
328
  self.silence_latent = self.silence_latent.to(self.device).to(self.dtype)
 
337
  return
338
 
339
  # Load to GPU
340
+ logger.info(f"[_load_model_context] Loading {model_name} to {self.device}")
341
  start_time = time.time()
342
  if model_name == "vae":
343
+ vae_dtype = self._get_vae_dtype()
344
  model.to(self.device).to(vae_dtype)
345
  else:
346
  model.to(self.device).to(self.dtype)
 
350
 
351
  load_time = time.time() - start_time
352
  self.current_offload_cost += load_time
353
+ logger.info(f"[_load_model_context] Loaded {model_name} to {self.device} in {load_time:.4f}s")
354
 
355
  try:
356
  yield
357
  finally:
358
  # Offload to CPU
359
+ logger.info(f"[_load_model_context] Offloading {model_name} to CPU")
360
  start_time = time.time()
361
  model.to("cpu")
362
 
 
366
  torch.cuda.empty_cache()
367
  offload_time = time.time() - start_time
368
  self.current_offload_cost += offload_time
369
+ logger.info(f"[_load_model_context] Offloaded {model_name} to CPU in {offload_time:.4f}s")
370
 
371
  def process_target_audio(self, audio_file) -> Optional[torch.Tensor]:
372
  """Process target audio"""
 
382
  else:
383
  audio = torch.from_numpy(audio_np.T)
384
 
385
+ # Normalize to stereo 48kHz
386
+ audio = self._normalize_audio_to_stereo_48k(audio, sr)
 
387
 
388
  return audio
389
  except Exception as e:
390
+ logger.exception("[process_target_audio] Error processing target audio")
391
  return None
392
 
393
  def _parse_audio_code_string(self, code_str: str) -> List[int]:
 
396
  return []
397
  try:
398
  return [int(x) for x in re.findall(r"<\|audio_code_(\d+)\|>", code_str)]
399
+ except Exception as e:
400
+ logger.debug(f"[_parse_audio_code_string] Failed to parse audio code string: {e}")
401
  return []
402
 
403
  def _decode_audio_codes_to_latents(self, code_str: str) -> Optional[torch.Tensor]:
 
524
  )
525
  """
526
  # Align instruction formatting with _prepare_batch
527
+ final_instruction = self._format_instruction(instruction or DEFAULT_DIT_INSTRUCTION)
 
 
528
 
529
  # Extract caption and language from metas if available (from LM CoT output)
530
  # Fallback to user-provided values if not in metas
 
555
 
556
  parsed_meta = self._parse_metas([metas])[0]
557
  caption_input = SFT_GEN_PROMPT.format(final_instruction, actual_caption, parsed_meta)
558
+ lyrics_input = self._format_lyrics(lyrics, actual_language)
559
  return caption_input, lyrics_input
560
 
561
  def _get_text_hidden_states(self, text_prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
 
598
  return match.group(1).strip()
599
  return caption
600
  except Exception as e:
601
+ logger.exception("[extract_caption_from_sft_format] Error extracting caption")
602
  return caption
603
 
604
  def prepare_seeds(self, actual_batch_size, seed, use_random_seed):
 
622
  else:
623
  try:
624
  seed_list.append(int(float(s)))
625
+ except (ValueError, TypeError) as e:
626
+ logger.debug(f"[prepare_seeds] Failed to parse seed value '{s}': {e}")
627
  seed_list.append(-1)
628
  elif seed is None or (isinstance(seed, (int, float)) and seed < 0):
629
  # If seed is None or negative, use -1 for all items
 
664
  return actual_seed_list, seed_value_for_ui
665
 
666
  def prepare_metadata(self, bpm, key_scale, time_signature):
667
+ """Build metadata dict - use "N/A" as default for empty fields."""
668
+ return self._build_metadata_dict(bpm, key_scale, time_signature)
669
+
670
+ def is_silence(self, audio):
671
+ return torch.all(audio.abs() < 1e-6)
672
+
673
+ def _get_project_root(self) -> str:
674
+ """Get project root directory path."""
675
+ current_file = os.path.abspath(__file__)
676
+ return os.path.dirname(os.path.dirname(current_file))
677
+
678
+ def _get_vae_dtype(self, device: Optional[str] = None) -> torch.dtype:
679
+ """Get VAE dtype based on device."""
680
+ device = device or self.device
681
+ return torch.bfloat16 if device in ["cuda", "xpu"] else self.dtype
682
+
683
+ def _format_instruction(self, instruction: str) -> str:
684
+ """Format instruction to ensure it ends with colon."""
685
+ if not instruction.endswith(":"):
686
+ instruction = instruction + ":"
687
+ return instruction
688
+
689
+ def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
690
+ """
691
+ Normalize audio to stereo 48kHz format.
692
+
693
+ Args:
694
+ audio: Audio tensor [channels, samples] or [samples]
695
+ sr: Sample rate
696
+
697
+ Returns:
698
+ Normalized audio tensor [2, samples] at 48kHz
699
+ """
700
+ # Convert to stereo (duplicate channel if mono)
701
+ if audio.shape[0] == 1:
702
+ audio = torch.cat([audio, audio], dim=0)
703
+
704
+ # Keep only first 2 channels
705
+ audio = audio[:2]
706
+
707
+ # Resample to 48kHz if needed
708
+ if sr != 48000:
709
+ audio = torchaudio.transforms.Resample(sr, 48000)(audio)
710
+
711
+ # Clamp values to [-1.0, 1.0]
712
+ audio = torch.clamp(audio, -1.0, 1.0)
713
+
714
+ return audio
715
+
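A minimal usage sketch for the normalization helper, assuming an already constructed `AceStepHandler` instance named `handler`:

```python
import torch

mono_44k = torch.randn(1, 44_100 * 5)                      # 5 s of mono audio at 44.1 kHz
stereo_48k = handler._normalize_audio_to_stereo_48k(mono_44k, 44_100)
assert stereo_48k.shape[0] == 2                            # mono duplicated to stereo
assert abs(stereo_48k.shape[-1] - 48_000 * 5) <= 1         # resampled to 48 kHz
assert stereo_48k.abs().max() <= 1.0                       # clamped to [-1, 1]
```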
716
+ def _normalize_audio_code_hints(self, audio_code_hints: Optional[Union[str, List[str]]], batch_size: int) -> List[Optional[str]]:
717
+ """Normalize audio_code_hints to list of correct length."""
718
+ if audio_code_hints is None:
719
+ normalized = [None] * batch_size
720
+ elif isinstance(audio_code_hints, str):
721
+ normalized = [audio_code_hints] * batch_size
722
+ elif len(audio_code_hints) == 1 and batch_size > 1:
723
+ normalized = audio_code_hints * batch_size
724
+ elif len(audio_code_hints) != batch_size:
725
+ # Pad or truncate to match batch_size
726
+ normalized = list(audio_code_hints[:batch_size])
727
+ while len(normalized) < batch_size:
728
+ normalized.append(None)
729
+ else:
730
+ normalized = list(audio_code_hints)
731
+
732
+ # Clean up: convert empty strings to None
733
+ normalized = [hint if isinstance(hint, str) and hint.strip() else None for hint in normalized]
734
+ return normalized
735
+
736
+ def _normalize_instructions(self, instructions: Optional[Union[str, List[str]]], batch_size: int, default: Optional[str] = None) -> List[str]:
737
+ """Normalize instructions to list of correct length."""
738
+ if instructions is None:
739
+ default_instruction = default or DEFAULT_DIT_INSTRUCTION
740
+ return [default_instruction] * batch_size
741
+ elif isinstance(instructions, str):
742
+ return [instructions] * batch_size
743
+ elif len(instructions) == 1:
744
+ return instructions * batch_size
745
+ elif len(instructions) != batch_size:
746
+ # Pad or truncate to match batch_size
747
+ normalized = list(instructions[:batch_size])
748
+ default_instruction = default or DEFAULT_DIT_INSTRUCTION
749
+ while len(normalized) < batch_size:
750
+ normalized.append(default_instruction)
751
+ return normalized
752
+ else:
753
+ return list(instructions)
754
+
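The two normalization helpers above broadcast a scalar or short list to the batch size and turn empty hints into `None`. A small hedged usage sketch, again assuming an `AceStepHandler` instance named `handler`:

```python
hints = handler._normalize_audio_code_hints("<|audio_code_1|>", batch_size=3)
# the single string is broadcast to every item:
#   ['<|audio_code_1|>', '<|audio_code_1|>', '<|audio_code_1|>']

cleaned = handler._normalize_audio_code_hints(["", None], batch_size=2)
# empty strings and None both normalize to None: [None, None]

instrs = handler._normalize_instructions(["a:", "b:", "c:", "d:"], batch_size=2)
# longer lists are truncated to the batch size: ['a:', 'b:']
```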
755
+ def _format_lyrics(self, lyrics: str, language: str) -> str:
756
+ """Format lyrics text with language header."""
757
+ return f"# Languages\n{language}\n\n# Lyric\n{lyrics}<|endoftext|>"
758
+
759
+ def _pad_sequences(self, sequences: List[torch.Tensor], max_length: int, pad_value: int = 0) -> torch.Tensor:
760
+ """Pad sequences to same length."""
761
+ return torch.stack([
762
+ torch.nn.functional.pad(seq, (0, max_length - len(seq)), 'constant', pad_value)
763
+ for seq in sequences
764
+ ])
765
+
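A usage sketch for the padding helper; the tensors and pad value are made up for illustration, and `handler` is assumed to be an `AceStepHandler` instance:

```python
import torch

seqs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
max_len = max(len(s) for s in seqs)
padded = handler._pad_sequences(seqs, max_len, pad_value=0)
# padded -> tensor([[5, 6, 7],
#                   [8, 9, 0]])   # shorter sequences are right-padded with pad_value
```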
766
+ def _extract_caption_and_language(self, metas: List[Union[str, Dict[str, Any]]], captions: List[str], vocal_languages: List[str]) -> Tuple[List[str], List[str]]:
767
+ """Extract caption and language from metas with fallback to provided values."""
768
+ actual_captions = list(captions)
769
+ actual_languages = list(vocal_languages)
770
+
771
+ for i, meta in enumerate(metas):
772
+ if i >= len(actual_captions):
773
+ break
774
+
775
+ meta_dict = None
776
+ if isinstance(meta, str):
777
+ parsed = self._parse_metas([meta])
778
+ if parsed and isinstance(parsed[0], dict):
779
+ meta_dict = parsed[0]
780
+ elif isinstance(meta, dict):
781
+ meta_dict = meta
782
+
783
+ if meta_dict:
784
+ if 'caption' in meta_dict and meta_dict['caption']:
785
+ actual_captions[i] = str(meta_dict['caption'])
786
+ if 'language' in meta_dict and meta_dict['language']:
787
+ actual_languages[i] = str(meta_dict['language'])
788
+
789
+ return actual_captions, actual_languages
790
+
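A hedged usage sketch for the caption/language extraction helper, assuming an `AceStepHandler` instance named `handler` and dict-style metas:

```python
metas = [{"caption": "dreamy synthwave", "language": "en"}, {}]
caps, langs = handler._extract_caption_and_language(
    metas,
    ["user caption", "other caption"],   # user-provided captions as fallback
    ["ja", "ja"],                        # user-provided languages as fallback
)
# caps  -> ['dreamy synthwave', 'other caption']   # LM caption overrides item 0 only
# langs -> ['en', 'ja']
```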
791
+ def _encode_audio_to_latents(self, audio: torch.Tensor) -> torch.Tensor:
792
+ """
793
+ Encode audio to latents using VAE.
794
+
795
+ Args:
796
+ audio: Audio tensor [channels, samples] or [batch, channels, samples]
797
+
798
+ Returns:
799
+ Latents tensor [T, D] or [batch, T, D]
800
+ """
801
+ needs_squeeze = audio.dim() == 2  # remember whether the input lacked a batch dimension
802
+ if needs_squeeze:
803
+ audio = audio.unsqueeze(0)
804
+
805
+ # Ensure input is in VAE's dtype
806
+ vae_input = audio.to(self.device).to(self.vae.dtype)
807
+
808
+ # Encode to latents
809
+ with torch.no_grad():
810
+ latents = self.vae.encode(vae_input).latent_dist.sample()
811
+
812
+ # Cast back to model dtype
813
+ latents = latents.to(self.dtype)
814
+
815
+ # Transpose: [batch, d, T] -> [batch, T, d]
816
+ latents = latents.transpose(1, 2)
817
+
818
+ # Remove batch dimension if input didn't have it
819
+ if needs_squeeze:
820
+ latents = latents.squeeze(0)
821
+
822
+ return latents
823
+
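A shape sketch for the VAE encoding helper, assuming a handler whose VAE is already initialized; the `// 1920` relation below is the same one this file uses when computing expected latent lengths (roughly 25 latent frames per second of 48 kHz audio):

```python
import torch

stereo_48k = torch.zeros(2, 48_000 * 10)                 # 10 s of stereo 48 kHz audio
latents = handler._encode_audio_to_latents(stereo_48k)   # -> [T, latent_dim]
print(latents.shape[0], stereo_48k.shape[-1] // 1920)    # both around 250 frames
```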
824
+ def _build_metadata_dict(self, bpm: Optional[Union[int, str]], key_scale: str, time_signature: str, duration: Optional[float] = None) -> Dict[str, Any]:
825
+ """
826
+ Build metadata dictionary with default values.
827
+
828
+ Args:
829
+ bpm: BPM value (optional)
830
+ key_scale: Key/scale string
831
+ time_signature: Time signature string
832
+ duration: Duration in seconds (optional)
833
+
834
+ Returns:
835
+ Metadata dictionary
836
+ """
837
  metadata_dict = {}
838
  if bpm:
839
  metadata_dict["bpm"] = bpm
 
849
  metadata_dict["timesignature"] = time_signature
850
  else:
851
  metadata_dict["timesignature"] = "N/A"
852
+
853
+ # Add duration if provided
854
+ if duration is not None:
855
+ metadata_dict["duration"] = f"{int(duration)} seconds"
856
+
857
  return metadata_dict
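A usage sketch for the metadata builder, assuming an `AceStepHandler` instance named `handler`:

```python
meta = handler._build_metadata_dict(120, "C Major", "4/4", duration=32.7)
# -> {'bpm': 120, 'keyscale': 'C Major', 'timesignature': '4/4', 'duration': '32 seconds'}

defaults = handler._build_metadata_dict(None, "", "")
# -> {'bpm': 'N/A', 'keyscale': 'N/A', 'timesignature': 'N/A'}
```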
 
858
 
859
  def generate_instruction(
860
  self,
 
901
  # Load audio file
902
  audio, sr = torchaudio.load(audio_file)
903
 
904
+ logger.debug(f"[process_reference_audio] Reference audio shape: {audio.shape}")
905
+ logger.debug(f"[process_reference_audio] Reference audio sample rate: {sr}")
906
+ logger.debug(f"[process_reference_audio] Reference audio duration: {audio.shape[-1] / 48000.0} seconds")
 
907
 
908
+ # Normalize to stereo 48kHz
909
+ audio = self._normalize_audio_to_stereo_48k(audio, sr)
 
910
 
911
  is_silence = self.is_silence(audio)
912
  if is_silence:
 
945
  return audio
946
 
947
  except Exception as e:
948
+ logger.exception("[process_reference_audio] Error processing reference audio")
949
  return None
950
 
951
  def process_src_audio(self, audio_file) -> Optional[torch.Tensor]:
 
956
  # Load audio file
957
  audio, sr = torchaudio.load(audio_file)
958
 
959
+ # Normalize to stereo 48kHz
960
+ audio = self._normalize_audio_to_stereo_48k(audio, sr)
 
961
 
962
  return audio
963
 
964
  except Exception as e:
965
+ logger.exception("[process_src_audio] Error processing source audio")
966
  return None
967
 
968
  def convert_src_audio_to_codes(self, audio_file) -> str:
 
990
  # Encode audio to latents using VAE
991
  with torch.no_grad():
992
  with self._load_model_context("vae"):
 
 
  # Check if audio is silence
994
+ if self.is_silence(processed_audio.unsqueeze(0)):
995
  return "❌ Audio file appears to be silent"
996
 
997
+ # Encode to latents using helper method
998
+ latents = self._encode_audio_to_latents(processed_audio) # [T, d]
 
999
 
1000
  # Create attention mask for latents
1001
  attention_mask = torch.ones(latents.shape[0], dtype=torch.bool, device=self.device)
 
1020
 
1021
  except Exception as e:
1022
  error_msg = f"❌ Error converting audio to codes: {str(e)}\n{traceback.format_exc()}"
1023
+ logger.exception("[convert_src_audio_to_codes] Error converting audio to codes")
1024
  return error_msg
1025
 
1026
  def prepare_batch_data(
 
1049
  calculated_duration = audio_duration
1050
 
1051
  # Build metadata dict - use "N/A" as default for empty fields
1052
+ metadata_dict = self._build_metadata_dict(bpm, key_scale, time_signature, calculated_duration)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1053
 
1054
  # Format metadata - inference service accepts dict and will convert to string
1055
  # Create a copy for each batch item (in case we modify it)
 
1085
  target_wavs = torch.zeros(2, frames)
1086
  return target_wavs
1087
  except Exception as e:
1088
+ logger.exception("[create_target_wavs] Error creating target audio")
1089
  # Fallback to 30 seconds if error
1090
  return torch.zeros(2, 30 * 48000)
1091
 
 
1266
  """
1267
  batch_size = len(captions)
1268
 
1269
+ # Normalize audio_code_hints to batch list
1270
+ audio_code_hints = self._normalize_audio_code_hints(audio_code_hints, batch_size)
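# Illustrative sketch (not part of the diff): a plausible "broadcast a single hint to the batch"
# behaviour. The real logic is self._normalize_audio_code_hints(); this sketch is an assumption.
def normalize_hints_sketch(hints, batch_size):
    if hints is None:
        return [None] * batch_size
    if isinstance(hints, str):
        hints = [hints]
    return (list(hints) + [None] * batch_size)[:batch_size]

print(normalize_hints_sketch("codes_for_item_0", 3))   # ['codes_for_item_0', None, None]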
 
 
 
 
 
 
 
 
1271
 
1272
  for ii, refer_audio_list in enumerate(refer_audios):
1273
  if isinstance(refer_audio_list, list):
 
1279
  if vocal_languages is None:
1280
  vocal_languages = self._create_fallback_vocal_languages(batch_size)
1281
 
 
 
 
 
 
 
 
 
 
 
 
1282
  # Parse metas with fallbacks
1283
  parsed_metas = self._parse_metas(metas)
1284
 
 
1312
  expected_latent_length = current_wav.shape[-1] // 1920
1313
  target_latent = self.silence_latent[0, :expected_latent_length, :]
1314
  else:
1315
+ # Encode using helper method
1316
  logger.info(f"[generate_music] Encoding target audio to latents for item {i}...")
1317
+ target_latent = self._encode_audio_to_latents(current_wav.squeeze(0)) # Remove batch dim for helper
 
 
 
 
1318
  target_latents_list.append(target_latent)
1319
  latent_lengths.append(target_latent.shape[0])
1320
 
 
1353
 
1354
  # Process instructions early so we can use them for task type detection
1355
  # Use custom instructions if provided, otherwise use default
1356
+ instructions = self._normalize_instructions(instructions, batch_size, DEFAULT_DIT_INSTRUCTION)
 
 
 
 
 
 
 
 
 
 
 
1357
 
1358
  # Generate chunk_masks and spans based on repainting parameters
1359
  # Also determine if this is a cover task (target audio provided without repainting)
 
1502
  else:
1503
  precomputed_lm_hints_25Hz = None
1504
 
1505
+ # Extract caption and language from metas if available (from LM CoT output)
1506
+ # Fall back to user-provided values if not in metas
1507
+ actual_captions, actual_languages = self._extract_caption_and_language(parsed_metas, captions, vocal_languages)
1508
+
1509
  # Format text_inputs
1510
  text_inputs = []
1511
  text_token_idss = []
 
1515
 
1516
  for i in range(batch_size):
1517
  # Use custom instruction for this batch item
1518
+ instruction = self._format_instruction(instructions[i] if i < len(instructions) else DEFAULT_DIT_INSTRUCTION)
1519
+
1520
+ actual_caption = actual_captions[i]
1521
+ actual_language = actual_languages[i]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1522
 
1523
  # Format text prompt with custom instruction (using LM-generated caption if available)
1524
  text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
 
1535
  text_attention_mask = text_inputs_dict.attention_mask[0].bool()
1536
 
1537
  # Format and tokenize lyrics (using LM-generated language if available)
1538
+ lyrics_text = self._format_lyrics(lyrics[i], actual_language)
1539
  lyrics_inputs_dict = self.text_tokenizer(
1540
  lyrics_text,
1541
  padding="longest",
 
1557
 
1558
  # Pad tokenized sequences
1559
  max_text_length = max(len(seq) for seq in text_token_idss)
1560
+ padded_text_token_idss = self._pad_sequences(text_token_idss, max_text_length, self.text_tokenizer.pad_token_id)
1561
+ padded_text_attention_masks = self._pad_sequences(text_attention_masks, max_text_length, 0)
 
 
 
 
 
 
 
 
 
 
 
 
1562
 
1563
  max_lyric_length = max(len(seq) for seq in lyric_token_idss)
1564
+ padded_lyric_token_idss = self._pad_sequences(lyric_token_idss, max_lyric_length, self.text_tokenizer.pad_token_id)
1565
+ padded_lyric_attention_masks = self._pad_sequences(lyric_attention_masks, max_lyric_length, 0)
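# Illustrative sketch (not part of the diff): what a _pad_sequences-style helper typically does,
# shown standalone. The real helper lives on the handler; the padding semantics here are an assumption.
import torch

def pad_sequences_sketch(seqs, max_len, pad_value):
    padded = []
    for seq in seqs:
        filler = torch.full((max_len - len(seq),), pad_value, dtype=seq.dtype)
        padded.append(torch.cat([seq, filler]))
    return torch.stack(padded)

ids = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
print(pad_sequences_sketch(ids, max_len=4, pad_value=0))
# tensor([[5, 6, 7, 0],
#         [8, 9, 0, 0]])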
 
 
 
 
 
 
 
 
 
 
 
 
1566
 
1567
  padded_non_cover_text_input_ids = None
1568
  padded_non_cover_text_attention_masks = None
 
1571
  non_cover_text_attention_masks = []
1572
  for i in range(batch_size):
1573
  # Use custom instruction for this batch item
1574
+ instruction = self._format_instruction(DEFAULT_DIT_INSTRUCTION)
1575
 
1576
  # Extract caption from metas if available (from LM CoT output)
1577
+ actual_caption = actual_captions[i]
 
 
 
 
1578
 
1579
  # Format text prompt with custom instruction (using LM-generated caption if available)
1580
  text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
 
1592
  non_cover_text_input_ids.append(text_token_ids)
1593
  non_cover_text_attention_masks.append(non_cover_text_attention_mask)
1594
 
1595
+ padded_non_cover_text_input_ids = self._pad_sequences(non_cover_text_input_ids, max_text_length, self.text_tokenizer.pad_token_id)
1596
+ padded_non_cover_text_attention_masks = self._pad_sequences(non_cover_text_attention_masks, max_text_length, 0)
 
 
 
 
 
 
 
 
 
 
 
1597
 
1598
  if audio_cover_strength < 1.0:
1599
  assert padded_non_cover_text_input_ids is not None, "When audio_cover_strength < 1.0, padded_non_cover_text_input_ids must not be None"
 
1827
  if self.config.is_turbo:
1828
  # Limit inference steps to maximum 8
1829
  if infer_steps > 8:
1830
+ logger.warning(f"[service_generate] dmd_gan version: infer_steps {infer_steps} exceeds maximum 8, clamping to 8")
1831
  infer_steps = 8
1832
  # CFG parameters are not adjustable for dmd_gan (they will be ignored)
1833
  # Note: guidance_scale, cfg_interval_start, cfg_interval_end are still passed but may be ignored by the model
 
1850
  if isinstance(repainting_end, (int, float)):
1851
  repainting_end = [repainting_end]
1852
 
 
 
 
 
 
 
 
 
 
 
 
 
1853
  # Get batch size from captions
1854
  batch_size = len(captions)
1855
 
1856
+ # Normalize instructions and audio_code_hints to match batch size
1857
+ instructions = self._normalize_instructions(instructions, batch_size, DEFAULT_DIT_INSTRUCTION) if instructions is not None else None
1858
+ audio_code_hints = self._normalize_audio_code_hints(audio_code_hints, batch_size) if audio_code_hints is not None else None
 
 
 
 
 
 
1859
 
1860
  # Convert seed to list format
1861
  if seed is None:
 
1952
  logger.info("[service_generate] Generating audio...")
1953
  with self._load_model_context("model"):
1954
  outputs = self.model.generate_audio(**generate_kwargs)
1955
+
1956
+ # Add intermediate information to outputs for extra_outputs
1957
+ outputs["src_latents"] = src_latents
1958
+ outputs["target_latents_input"] = target_latents # Input target latents (before generation)
1959
+ outputs["chunk_masks"] = chunk_mask
1960
+ outputs["spans"] = spans
1961
+ outputs["latent_masks"] = batch.get("latent_masks") # Latent masks for valid length
1962
+
1963
  return outputs
1964
 
1965
  def tiled_decode(self, latents, chunk_size=512, overlap=64):
 
2055
  use_adg: bool = False,
2056
  cfg_interval_start: float = 0.0,
2057
  cfg_interval_end: float = 1.0,
 
 
2058
  use_tiled_decode: bool = True,
2059
  progress=None
2060
+ ) -> Dict[str, Any]:
2061
  """
2062
  Main interface for music generation
2063
 
2064
  Returns:
2065
+ Dictionary containing:
2066
+ - audios: List of audio dictionaries with path, key, params
2067
+ - generation_info: Markdown-formatted generation information
2068
+ - status_message: Status message
2069
+ - extra_outputs: Dictionary with latents, masks, time_costs, etc.
2070
+ - success: Whether generation completed successfully
2071
+ - error: Error message if generation failed
2072
  """
2073
  if progress is None:
2074
  def progress(*args, **kwargs):
2075
  pass
2076
 
2077
  if self.model is None or self.vae is None or self.text_tokenizer is None or self.text_encoder is None:
2078
+ return {
2079
+ "audios": [],
2080
+ "status_message": "❌ Model not fully initialized. Please initialize all components first.",
2081
+ "extra_outputs": {},
2082
+ "success": False,
2083
+ "error": "Model not fully initialized",
2084
+ }
2085
 
2086
  def _has_audio_codes(v: Union[str, List[str]]) -> bool:
2087
  if isinstance(v, list):
 
2100
 
2101
  logger.info("[generate_music] Starting generation...")
2102
  if progress:
2103
+ progress(0.51, desc="Preparing inputs...")
2104
  logger.info("[generate_music] Preparing inputs...")
2105
 
2106
  # Reset offload cost
 
2122
  repainting_end = None
2123
 
2124
  try:
 
 
2125
  # 1. Process reference audio
2126
  refer_audios = None
2127
  if reference_audio is not None:
 
2173
  can_use_repainting
2174
  )
2175
 
2176
+ progress(0.52, desc=f"Generating music (batch size: {actual_batch_size})...")
2177
 
2178
  # Prepare audio_code_hints - use if audio_code_string is provided
2179
  # This works for both text2music (auto-switched to cover) and cover tasks
 
2210
  pred_latents = outputs["target_latents"] # [batch, latent_length, latent_dim]
2211
  time_costs = outputs["time_costs"]
2212
  time_costs["offload_time_cost"] = self.current_offload_cost
2213
+ logger.debug(f"[generate_music] pred_latents: {pred_latents.shape}, dtype={pred_latents.dtype} {pred_latents.min()=}, {pred_latents.max()=}, {pred_latents.mean()=} {pred_latents.std()=}")
2214
+ logger.debug(f"[generate_music] time_costs: {time_costs}")
2215
  if progress:
2216
  progress(0.8, desc="Decoding audio...")
2217
  logger.info("[generate_music] Decoding latents with VAE...")
 
2240
  # Update offload cost one last time to include VAE offloading
2241
  time_costs["offload_time_cost"] = self.current_offload_cost
2242
 
2243
+ logger.info("[generate_music] VAE decode completed. Preparing audio tensors...")
2244
  if progress:
2245
+ progress(0.99, desc="Preparing audio data...")
2246
 
2247
+ # Prepare audio tensors (no file I/O here, no UUID generation)
2248
+ # pred_wavs is already [batch, channels, samples] format
2249
+ # Move to CPU and convert to float32 for return
2250
+ audio_tensors = []
2251
 
 
 
2252
  for i in range(actual_batch_size):
2253
+ # Extract audio tensor: [channels, samples] format, CPU, float32
2254
+ audio_tensor = pred_wavs[i].cpu().float()
2255
+ audio_tensors.append(audio_tensor)
2256
+
 
 
 
 
 
 
 
2257
  status_message = f"✅ Generation completed successfully!"
2258
+ logger.info(f"[generate_music] Done! Generated {len(audio_tensors)} audio tensors.")
2259
+
2260
+ # Extract intermediate information from outputs
2261
+ src_latents = outputs.get("src_latents") # [batch, T, D]
2262
+ target_latents_input = outputs.get("target_latents_input") # [batch, T, D]
2263
+ chunk_masks = outputs.get("chunk_masks") # [batch, T]
2264
+ spans = outputs.get("spans", []) # List of tuples
2265
+ latent_masks = outputs.get("latent_masks") # [batch, T]
2266
+
2267
+ # Move latents to CPU to save memory (they can be large)
2268
+ extra_outputs = {
2269
+ "pred_latents": pred_latents.cpu() if pred_latents is not None else None,
2270
+ "target_latents": target_latents_input.cpu() if target_latents_input is not None else None,
2271
+ "src_latents": src_latents.cpu() if src_latents is not None else None,
2272
+ "chunk_masks": chunk_masks.cpu() if chunk_masks is not None else None,
2273
+ "latent_masks": latent_masks.cpu() if latent_masks is not None else None,
2274
+ "spans": spans,
2275
+ "time_costs": time_costs,
2276
+ "seed_value": seed_value_for_ui,
2277
+ }
2278
+
2279
+ # Build audios list with tensor data (no file paths, no UUIDs, handled outside)
2280
+ audios = []
2281
+ for idx, audio_tensor in enumerate(audio_tensors):
2282
+ audio_dict = {
2283
+ "tensor": audio_tensor, # torch.Tensor [channels, samples], CPU, float32
2284
+ "sample_rate": self.sample_rate,
2285
+ }
2286
+ audios.append(audio_dict)
2287
+
2288
+ return {
2289
+ "audios": audios,
2290
+ "status_message": status_message,
2291
+ "extra_outputs": extra_outputs,
2292
+ "success": True,
2293
+ "error": None,
2294
+ }
2295
 
2296
  except Exception as e:
2297
  error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
2298
+ logger.exception("[generate_music] Generation failed")
2299
+ return {
2300
+ "audios": [],
2301
+ "status_message": error_msg,
2302
+ "extra_outputs": {},
2303
+ "success": False,
2304
+ "error": str(e),
2305
+ }
acestep/inference.py CHANGED
@@ -7,105 +7,100 @@ backward-compatible Gradio UI support.
7
  """
8
 
9
  import math
 
 
10
  from typing import Optional, Union, List, Dict, Any, Tuple
11
  from dataclasses import dataclass, field, asdict
12
  from loguru import logger
13
- import time as time_module
 
14
 
15
 
16
  @dataclass
17
- class GenerationConfig:
18
- """Configuration for music generation.
19
 
20
  Attributes:
21
  # Text Inputs
22
- caption: Text description of the desired music
23
- lyrics: Lyrics text for vocal music (use "[Instrumental]" for instrumental)
 
24
 
25
  # Music Metadata
26
- bpm: Beats per minute (e.g., 120). None for auto-detection
27
- key_scale: Musical key (e.g., "C Major", "Am"). Empty for auto-detection
28
- time_signature: Time signature (e.g., "4/4", "3/4"). Empty for auto-detection
29
- vocal_language: Language code for vocals (e.g., "en", "zh", "ja")
30
- audio_duration: Duration in seconds. None for auto-detection
31
 
32
  # Generation Parameters
33
- inference_steps: Number of denoising steps (8 for turbo, 32-100 for base)
34
- guidance_scale: Classifier-free guidance scale (higher = more adherence to prompt)
35
- use_random_seed: Whether to use random seed (True) or fixed seed
36
- seed: Random seed for reproducibility (-1 for random)
37
- batch_size: Number of samples to generate (1-8)
38
 
39
  # Advanced DiT Parameters
40
- use_adg: Use Adaptive Dual Guidance (base model only)
41
- cfg_interval_start: CFG application start ratio (0.0-1.0)
42
- cfg_interval_end: CFG application end ratio (0.0-1.0)
43
- audio_format: Output audio format ("mp3", "wav", "flac")
44
 
45
  # Task-Specific Parameters
46
- task_type: Generation task type ("text2music", "cover", "repaint", "lego", "extract", "complete")
47
- reference_audio: Path to reference audio file (for style transfer)
48
- src_audio: Path to source audio file (for audio-to-audio tasks)
49
- audio_code_string: Pre-extracted audio codes (advanced use)
50
- repainting_start: Repainting start time in seconds (for repaint/lego tasks)
51
- repainting_end: Repainting end time in seconds (-1 for end of audio)
52
- audio_cover_strength: Strength of audio cover/codes influence (0.0-1.0)
53
- instruction: Task-specific instruction prompt (auto-generated if empty)
54
 
55
- # 5Hz Language Model Parameters (CoT Reasoning)
56
- use_llm_thinking: Enable LM-based Chain-of-Thought reasoning for metadata/codes
57
- lm_temperature: LM sampling temperature (0.0-2.0, higher = more creative)
58
- lm_cfg_scale: LM classifier-free guidance scale
59
- lm_top_k: LM top-k sampling (0 = disabled)
60
- lm_top_p: LM nucleus sampling (1.0 = disabled)
61
- lm_negative_prompt: Negative prompt for LM guidance
62
- use_cot_metas: Generate metadata using LM CoT
63
- use_cot_caption: Refine caption using LM CoT
64
- use_cot_language: Detect language using LM CoT
65
- is_format_caption: Whether caption is already formatted
66
- constrained_decoding_debug: Enable debug logging for constrained decoding
67
-
68
- # Batch LM Generation
69
- allow_lm_batch: Allow batch LM code generation (faster for batch_size >= 2)
70
- lm_batch_chunk_size: Maximum batch size per LM inference chunk (GPU memory constraint)
71
  """
72
-
 
 
 
 
 
 
 
 
 
 
73
  # Text Inputs
74
  caption: str = ""
75
  lyrics: str = ""
76
-
77
- # Music Metadata
78
- bpm: Optional[int] = None
79
- key_scale: str = ""
80
- time_signature: str = ""
81
  vocal_language: str = "unknown"
82
- audio_duration: Optional[float] = None
83
-
84
- # Generation Parameters
 
 
 
85
  inference_steps: int = 8
86
- guidance_scale: float = 7.0
87
- use_random_seed: bool = True
88
  seed: int = -1
89
- batch_size: int = 1
90
-
91
- # Advanced DiT Parameters
92
  use_adg: bool = False
93
  cfg_interval_start: float = 0.0
94
  cfg_interval_end: float = 1.0
95
- audio_format: str = "mp3"
96
-
97
- # Task-Specific Parameters
98
- task_type: str = "text2music"
99
- reference_audio: Optional[str] = None
100
- src_audio: Optional[str] = None
101
- audio_code_string: Union[str, List[str]] = ""
102
  repainting_start: float = 0.0
103
  repainting_end: float = -1
104
  audio_cover_strength: float = 1.0
105
- instruction: str = ""
106
-
107
  # 5Hz Language Model Parameters
108
- use_llm_thinking: bool = False
109
  lm_temperature: float = 0.85
110
  lm_cfg_scale: float = 2.0
111
  lm_top_k: int = 0
@@ -113,13 +108,50 @@ class GenerationConfig:
113
  lm_negative_prompt: str = "NO USER INPUT"
114
  use_cot_metas: bool = True
115
  use_cot_caption: bool = True
 
116
  use_cot_language: bool = True
117
- is_format_caption: bool = False
118
- constrained_decoding_debug: bool = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
- # Batch LM Generation
 
 
 
 
 
 
 
 
 
 
 
 
121
  allow_lm_batch: bool = False
122
- lm_batch_chunk_size: int = 4
 
 
 
 
 
 
 
 
123
 
124
 
125
  @dataclass
@@ -128,801 +160,461 @@ class GenerationResult:
128
 
129
  Attributes:
130
  # Audio Outputs
131
- audio_paths: List of paths to generated audio files
132
- first_audio: Path to first generated audio (backward compatibility)
133
- second_audio: Path to second generated audio (backward compatibility)
134
-
135
- # Generation Information
136
- generation_info: Markdown-formatted generation information
137
  status_message: Status message from generation
138
- seed_value: Actual seed value used for generation
139
-
140
- # LM-Generated Metadata (if applicable)
141
- lm_metadata: Metadata generated by language model (dict or None)
142
-
143
- # Audio-Text Alignment Scores (if available)
144
- align_score_1: First alignment score
145
- align_text_1: First alignment text description
146
- align_plot_1: First alignment plot image
147
- align_score_2: Second alignment score
148
- align_text_2: Second alignment text description
149
- align_plot_2: Second alignment plot image
150
-
151
- # Success Status
152
  success: Whether generation completed successfully
153
  error: Error message if generation failed
154
  """
155
-
156
  # Audio Outputs
157
- audio_paths: List[str] = field(default_factory=list)
158
- first_audio: Optional[str] = None
159
- second_audio: Optional[str] = None
160
-
161
  # Generation Information
162
- generation_info: str = ""
163
  status_message: str = ""
164
- seed_value: str = ""
165
-
166
- # LM-Generated Metadata
167
- lm_metadata: Optional[Dict[str, Any]] = None
168
-
169
- # Audio-Text Alignment Scores
170
- align_score_1: Optional[float] = None
171
- align_text_1: Optional[str] = None
172
- align_plot_1: Optional[Any] = None
173
- align_score_2: Optional[float] = None
174
- align_text_2: Optional[str] = None
175
- align_plot_2: Optional[Any] = None
176
-
177
  # Success Status
178
  success: bool = True
179
  error: Optional[str] = None
180
-
181
  def to_dict(self) -> Dict[str, Any]:
182
  """Convert result to dictionary for JSON serialization."""
183
  return asdict(self)
184
 
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  def generate_music(
187
  dit_handler,
188
  llm_handler,
 
189
  config: GenerationConfig,
 
 
190
  ) -> GenerationResult:
191
  """Generate music using ACE-Step model with optional LM reasoning.
192
 
193
- This is the main inference API for music generation. It supports various task types
194
- (text2music, cover, repaint, etc.) and can optionally use a 5Hz Language Model for
195
- Chain-of-Thought reasoning to generate metadata and audio codes.
196
-
197
  Args:
198
  dit_handler: Initialized DiT model handler (AceStepHandler instance)
199
  llm_handler: Initialized LLM handler (LLMHandler instance)
 
200
  config: Generation configuration (GenerationConfig instance)
201
 
202
  Returns:
203
- GenerationResult: Generation result containing audio paths and metadata
204
-
205
- Example:
206
- >>> from acestep.handler import AceStepHandler
207
- >>> from acestep.llm_inference import LLMHandler
208
- >>> from acestep.inference import GenerationConfig, generate_music
209
- >>>
210
- >>> # Initialize handlers
211
- >>> dit_handler = AceStepHandler()
212
- >>> llm_handler = LLMHandler()
213
- >>> dit_handler.initialize_service(...)
214
- >>> llm_handler.initialize(...)
215
- >>>
216
- >>> # Configure generation
217
- >>> config = GenerationConfig(
218
- ... caption="upbeat electronic dance music",
219
- ... bpm=128,
220
- ... audio_duration=30,
221
- ... batch_size=2,
222
- ... )
223
- >>>
224
- >>> # Generate music
225
- >>> result = generate_music(dit_handler, llm_handler, config)
226
- >>> print(f"Generated {len(result.audio_paths)} audio files")
227
- >>> for path in result.audio_paths:
228
- ... print(f"Audio: {path}")
229
  """
230
-
231
  try:
232
  # Phase 1: LM-based metadata and code generation (if enabled)
233
- audio_code_string_to_use = config.audio_code_string
234
  lm_generated_metadata = None
235
- lm_generated_audio_codes = None
236
  lm_generated_audio_codes_list = []
237
-
 
 
 
 
 
238
  # Extract mutable copies of metadata (will be updated by LM if needed)
239
- bpm = config.bpm
240
- key_scale = config.key_scale
241
- time_signature = config.time_signature
242
- audio_duration = config.audio_duration
243
-
244
- # Determine if we should use batch LM generation
245
- should_use_lm_batch = (
246
- config.use_llm_thinking
247
- and llm_handler.llm_initialized
248
- and config.use_cot_metas
249
- and config.allow_lm_batch
250
- and config.batch_size >= 2
251
- )
252
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  # LM-based Chain-of-Thought reasoning
254
- if config.use_llm_thinking and llm_handler.llm_initialized and config.use_cot_metas:
255
- # Convert sampling parameters
256
- top_k_value = None if config.lm_top_k == 0 else int(config.lm_top_k)
257
- top_p_value = None if config.lm_top_p >= 1.0 else config.lm_top_p
258
-
 
 
259
  # Build user_metadata from user-provided values
260
  user_metadata = {}
261
  if bpm is not None:
262
  try:
263
  bpm_value = float(bpm)
264
  if bpm_value > 0:
265
- user_metadata['bpm'] = str(int(bpm_value))
266
  except (ValueError, TypeError):
267
  pass
268
-
269
  if key_scale and key_scale.strip():
270
  key_scale_clean = key_scale.strip()
271
  if key_scale_clean.lower() not in ["n/a", ""]:
272
  user_metadata['keyscale'] = key_scale_clean
273
-
274
  if time_signature and time_signature.strip():
275
  time_sig_clean = time_signature.strip()
276
  if time_sig_clean.lower() not in ["n/a", ""]:
277
  user_metadata['timesignature'] = time_sig_clean
278
-
279
  if audio_duration is not None:
280
  try:
281
  duration_value = float(audio_duration)
282
  if duration_value > 0:
283
- user_metadata['duration'] = str(int(duration_value))
284
  except (ValueError, TypeError):
285
  pass
286
-
287
  user_metadata_to_pass = user_metadata if user_metadata else None
288
-
289
- # Batch LM generation (faster for multiple samples)
290
- if should_use_lm_batch:
291
- actual_seed_list, _ = dit_handler.prepare_seeds(
292
- config.batch_size, config.seed, config.use_random_seed
293
- )
294
-
295
- max_inference_batch_size = int(config.lm_batch_chunk_size)
296
- num_chunks = math.ceil(config.batch_size / max_inference_batch_size)
297
-
298
- all_metadata_list = []
299
- all_audio_codes_list = []
300
-
301
- for chunk_idx in range(num_chunks):
302
- chunk_start = chunk_idx * max_inference_batch_size
303
- chunk_end = min(chunk_start + max_inference_batch_size, config.batch_size)
304
- chunk_size = chunk_end - chunk_start
305
- chunk_seeds = actual_seed_list[chunk_start:chunk_end]
306
-
307
- logger.info(
308
- f"LM batch chunk {chunk_idx+1}/{num_chunks} "
309
- f"(size: {chunk_size}, seeds: {chunk_seeds})"
310
- )
311
-
312
- metadata_list, audio_codes_list, status = llm_handler.generate_with_stop_condition_batch(
313
- caption=config.caption or "",
314
- lyrics=config.lyrics or "",
315
- batch_size=chunk_size,
316
- infer_type="llm_dit",
317
- temperature=config.lm_temperature,
318
- cfg_scale=config.lm_cfg_scale,
319
- negative_prompt=config.lm_negative_prompt,
320
- top_k=top_k_value,
321
- top_p=top_p_value,
322
- user_metadata=user_metadata_to_pass,
323
- use_cot_caption=config.use_cot_caption,
324
- use_cot_language=config.use_cot_language,
325
- is_format_caption=config.is_format_caption,
326
- constrained_decoding_debug=config.constrained_decoding_debug,
327
- seeds=chunk_seeds,
328
- )
329
-
330
- all_metadata_list.extend(metadata_list)
331
- all_audio_codes_list.extend(audio_codes_list)
332
-
333
- lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
334
- lm_generated_audio_codes_list = all_audio_codes_list
335
- audio_code_string_to_use = all_audio_codes_list
336
-
337
- # Update metadata from LM if not provided by user
338
- if lm_generated_metadata:
339
- bpm, key_scale, time_signature, audio_duration = _update_metadata_from_lm(
340
- lm_generated_metadata, bpm, key_scale, time_signature, audio_duration
341
- )
342
-
343
- else:
344
- # Sequential LM generation (current behavior)
345
- # Phase 1: Generate CoT metadata
346
- phase1_start = time_module.time()
347
- metadata, _, status = llm_handler.generate_with_stop_condition(
348
- caption=config.caption or "",
349
- lyrics=config.lyrics or "",
350
- infer_type="dit",
351
- temperature=config.lm_temperature,
352
- cfg_scale=config.lm_cfg_scale,
353
- negative_prompt=config.lm_negative_prompt,
354
- top_k=top_k_value,
355
- top_p=top_p_value,
356
- user_metadata=user_metadata_to_pass,
357
- use_cot_caption=config.use_cot_caption,
358
- use_cot_language=config.use_cot_language,
359
- is_format_caption=config.is_format_caption,
360
- constrained_decoding_debug=config.constrained_decoding_debug,
361
- )
362
- lm_phase1_time = time_module.time() - phase1_start
363
- logger.info(f"LM Phase 1 (CoT) completed in {lm_phase1_time:.2f}s")
364
-
365
- # Phase 2: Generate audio codes
366
- phase2_start = time_module.time()
367
- metadata, audio_codes, status = llm_handler.generate_with_stop_condition(
368
- caption=config.caption or "",
369
- lyrics=config.lyrics or "",
370
- infer_type="llm_dit",
371
- temperature=config.lm_temperature,
372
- cfg_scale=config.lm_cfg_scale,
373
- negative_prompt=config.lm_negative_prompt,
374
  top_k=top_k_value,
375
  top_p=top_p_value,
376
  user_metadata=user_metadata_to_pass,
377
- use_cot_caption=config.use_cot_caption,
378
- use_cot_language=config.use_cot_language,
379
- is_format_caption=config.is_format_caption,
 
380
  constrained_decoding_debug=config.constrained_decoding_debug,
 
 
 
381
  )
382
- lm_phase2_time = time_module.time() - phase2_start
383
- logger.info(f"LM Phase 2 (Codes) completed in {lm_phase2_time:.2f}s")
384
-
385
- lm_generated_metadata = metadata
386
- if audio_codes:
387
- audio_code_string_to_use = audio_codes
388
- lm_generated_audio_codes = audio_codes
389
-
390
- # Update metadata from LM if not provided by user
391
- bpm, key_scale, time_signature, audio_duration = _update_metadata_from_lm(
392
- metadata, bpm, key_scale, time_signature, audio_duration
 
393
  )
394
-
 
 
 
 
 
 
 
 
 
395
  # Phase 2: DiT music generation
 
396
  result = dit_handler.generate_music(
397
- captions=config.caption,
398
- lyrics=config.lyrics,
399
  bpm=bpm,
400
  key_scale=key_scale,
401
  time_signature=time_signature,
402
- vocal_language=config.vocal_language,
403
- inference_steps=config.inference_steps,
404
- guidance_scale=config.guidance_scale,
405
  use_random_seed=config.use_random_seed,
406
- seed=config.seed,
407
- reference_audio=config.reference_audio,
408
  audio_duration=audio_duration,
409
- batch_size=config.batch_size,
410
- src_audio=config.src_audio,
411
  audio_code_string=audio_code_string_to_use,
412
- repainting_start=config.repainting_start,
413
- repainting_end=config.repainting_end,
414
- instruction=config.instruction,
415
- audio_cover_strength=config.audio_cover_strength,
416
- task_type=config.task_type,
417
- use_adg=config.use_adg,
418
- cfg_interval_start=config.cfg_interval_start,
419
- cfg_interval_end=config.cfg_interval_end,
420
- audio_format=config.audio_format,
421
- lm_temperature=config.lm_temperature,
422
  )
423
-
424
- # Extract results
425
- (first_audio, second_audio, all_audio_paths, generation_info, status_message,
426
- seed_value, align_score_1, align_text_1, align_plot_1,
427
- align_score_2, align_text_2, align_plot_2) = result
428
-
429
- # Append LM metadata to generation info
430
- if lm_generated_metadata:
431
- generation_info = _append_lm_metadata_to_info(generation_info, lm_generated_metadata)
432
-
433
- # Create result object
 
 
 
 
 
 
 
 
 
434
  return GenerationResult(
435
- audio_paths=all_audio_paths or [],
436
- first_audio=first_audio,
437
- second_audio=second_audio,
438
- generation_info=generation_info,
439
  status_message=status_message,
440
- seed_value=seed_value,
441
- lm_metadata=lm_generated_metadata,
442
- align_score_1=align_score_1,
443
- align_text_1=align_text_1,
444
- align_plot_1=align_plot_1,
445
- align_score_2=align_score_2,
446
- align_text_2=align_text_2,
447
- align_plot_2=align_plot_2,
448
  success=True,
449
  error=None,
450
  )
451
-
452
  except Exception as e:
453
  logger.exception("Music generation failed")
454
  return GenerationResult(
 
 
 
455
  success=False,
456
  error=str(e),
457
- generation_info=f"❌ Generation failed: {str(e)}",
458
- status_message=f"Error: {str(e)}",
459
  )
460
-
461
-
462
- def _update_metadata_from_lm(
463
- metadata: Dict[str, Any],
464
- bpm: Optional[int],
465
- key_scale: str,
466
- time_signature: str,
467
- audio_duration: Optional[float],
468
- ) -> Tuple[Optional[int], str, str, Optional[float]]:
469
- """Update metadata fields from LM output if not provided by user."""
470
-
471
- if bpm is None and metadata.get('bpm'):
472
- bpm_value = metadata.get('bpm')
473
- if bpm_value not in ["N/A", ""]:
474
- try:
475
- bpm = int(bpm_value)
476
- except (ValueError, TypeError):
477
- pass
478
-
479
- if not key_scale and metadata.get('keyscale'):
480
- key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
481
- if key_scale_value != "N/A":
482
- key_scale = key_scale_value
483
-
484
- if not time_signature and metadata.get('timesignature'):
485
- time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
486
- if time_signature_value != "N/A":
487
- time_signature = time_signature_value
488
-
489
- if audio_duration is None or audio_duration <= 0:
490
- audio_duration_value = metadata.get('duration', -1)
491
- if audio_duration_value not in ["N/A", ""]:
492
- try:
493
- audio_duration = float(audio_duration_value)
494
- except (ValueError, TypeError):
495
- pass
496
-
497
- return bpm, key_scale, time_signature, audio_duration
498
-
499
-
500
- def _append_lm_metadata_to_info(generation_info: str, metadata: Dict[str, Any]) -> str:
501
- """Append LM-generated metadata to generation info string."""
502
-
503
- metadata_lines = []
504
- if metadata.get('bpm'):
505
- metadata_lines.append(f"- **BPM:** {metadata['bpm']}")
506
- if metadata.get('caption'):
507
- metadata_lines.append(f"- **Refined Caption:** {metadata['caption']}")
508
- if metadata.get('duration'):
509
- metadata_lines.append(f"- **Duration:** {metadata['duration']} seconds")
510
- if metadata.get('keyscale'):
511
- metadata_lines.append(f"- **Key Scale:** {metadata['keyscale']}")
512
- if metadata.get('language'):
513
- metadata_lines.append(f"- **Language:** {metadata['language']}")
514
- if metadata.get('timesignature'):
515
- metadata_lines.append(f"- **Time Signature:** {metadata['timesignature']}")
516
-
517
- if metadata_lines:
518
- metadata_section = "\n\n**🤖 LM-Generated Metadata:**\n" + "\n\n".join(metadata_lines)
519
- return metadata_section + "\n\n" + generation_info
520
-
521
- return generation_info
522
-
523
-
524
- # ============================================================================
525
- # LEGACY GRADIO UI COMPATIBILITY LAYER
526
- # ============================================================================
527
-
528
- def generate(
529
- dit_handler,
530
- llm_handler,
531
- captions,
532
- lyrics,
533
- bpm,
534
- key_scale,
535
- time_signature,
536
- vocal_language,
537
- inference_steps,
538
- guidance_scale,
539
- random_seed_checkbox,
540
- seed,
541
- reference_audio,
542
- audio_duration,
543
- batch_size_input,
544
- src_audio,
545
- text2music_audio_code_string,
546
- repainting_start,
547
- repainting_end,
548
- instruction_display_gen,
549
- audio_cover_strength,
550
- task_type,
551
- use_adg,
552
- cfg_interval_start,
553
- cfg_interval_end,
554
- audio_format,
555
- lm_temperature,
556
- think_checkbox,
557
- lm_cfg_scale,
558
- lm_top_k,
559
- lm_top_p,
560
- lm_negative_prompt,
561
- use_cot_metas,
562
- use_cot_caption,
563
- use_cot_language,
564
- is_format_caption,
565
- constrained_decoding_debug,
566
- allow_lm_batch,
567
- lm_batch_chunk_size,
568
- ):
569
- """Legacy Gradio UI compatibility wrapper.
570
-
571
- This function maintains backward compatibility with the Gradio UI.
572
- For new integrations, use generate_music() with GenerationConfig instead.
573
-
574
- Returns:
575
- Tuple with 28 elements for Gradio UI component updates
576
- """
577
-
578
- # Convert legacy parameters to new config
579
- config = GenerationConfig(
580
- caption=captions,
581
- lyrics=lyrics,
582
- bpm=bpm,
583
- key_scale=key_scale,
584
- time_signature=time_signature,
585
- vocal_language=vocal_language,
586
- audio_duration=audio_duration,
587
- inference_steps=inference_steps,
588
- guidance_scale=guidance_scale,
589
- use_random_seed=random_seed_checkbox,
590
- seed=seed,
591
- batch_size=batch_size_input,
592
- use_adg=use_adg,
593
- cfg_interval_start=cfg_interval_start,
594
- cfg_interval_end=cfg_interval_end,
595
- audio_format=audio_format,
596
- task_type=task_type,
597
- reference_audio=reference_audio,
598
- src_audio=src_audio,
599
- audio_code_string=text2music_audio_code_string,
600
- repainting_start=repainting_start,
601
- repainting_end=repainting_end,
602
- audio_cover_strength=audio_cover_strength,
603
- instruction=instruction_display_gen,
604
- use_llm_thinking=think_checkbox,
605
- lm_temperature=lm_temperature,
606
- lm_cfg_scale=lm_cfg_scale,
607
- lm_top_k=lm_top_k,
608
- lm_top_p=lm_top_p,
609
- lm_negative_prompt=lm_negative_prompt,
610
- use_cot_metas=use_cot_metas,
611
- use_cot_caption=use_cot_caption,
612
- use_cot_language=use_cot_language,
613
- is_format_caption=is_format_caption,
614
- constrained_decoding_debug=constrained_decoding_debug,
615
- allow_lm_batch=allow_lm_batch,
616
- lm_batch_chunk_size=lm_batch_chunk_size,
617
- )
618
-
619
- # Call new API
620
- result = generate_music(dit_handler, llm_handler, config)
621
-
622
- # Determine which codes to update in UI
623
- if config.allow_lm_batch and result.lm_metadata:
624
- # Batch mode: extract codes from metadata if available
625
- lm_codes_list = result.lm_metadata.get('audio_codes_list', [])
626
- updated_audio_codes = lm_codes_list[0] if lm_codes_list else text2music_audio_code_string
627
- codes_outputs = (lm_codes_list + [""] * 8)[:8]
628
- else:
629
- # Single mode
630
- lm_codes = result.lm_metadata.get('audio_codes', '') if result.lm_metadata else ''
631
- updated_audio_codes = lm_codes if lm_codes else text2music_audio_code_string
632
- codes_outputs = [""] * 8
633
-
634
- # Prepare audio outputs (up to 8)
635
- audio_outputs = (result.audio_paths + [None] * 8)[:8]
636
-
637
- # Return tuple for Gradio UI (28 elements)
638
- return (
639
- audio_outputs[0], # generated_audio_1
640
- audio_outputs[1], # generated_audio_2
641
- audio_outputs[2], # generated_audio_3
642
- audio_outputs[3], # generated_audio_4
643
- audio_outputs[4], # generated_audio_5
644
- audio_outputs[5], # generated_audio_6
645
- audio_outputs[6], # generated_audio_7
646
- audio_outputs[7], # generated_audio_8
647
- result.audio_paths, # generated_audio_batch
648
- result.generation_info,
649
- result.status_message,
650
- result.seed_value,
651
- result.align_score_1,
652
- result.align_text_1,
653
- result.align_plot_1,
654
- result.align_score_2,
655
- result.align_text_2,
656
- result.align_plot_2,
657
- updated_audio_codes, # Update main audio codes in UI
658
- codes_outputs[0], # text2music_audio_code_string_1
659
- codes_outputs[1], # text2music_audio_code_string_2
660
- codes_outputs[2], # text2music_audio_code_string_3
661
- codes_outputs[3], # text2music_audio_code_string_4
662
- codes_outputs[4], # text2music_audio_code_string_5
663
- codes_outputs[5], # text2music_audio_code_string_6
664
- codes_outputs[6], # text2music_audio_code_string_7
665
- codes_outputs[7], # text2music_audio_code_string_8
666
- result.lm_metadata, # Store metadata for "Send to src audio" buttons
667
- is_format_caption, # Keep is_format_caption unchanged
668
- )
669
-
670
-
671
- # ============================================================================
672
- # TESTING & EXAMPLES
673
- # ============================================================================
674
-
675
- if __name__ == "__main__":
676
- """
677
- Test suite for the inference API.
678
- Demonstrates various usage scenarios and validates functionality.
679
-
680
- Usage:
681
- python -m acestep.inference
682
- """
683
-
684
- import os
685
- import json
686
- from acestep.handler import AceStepHandler
687
- from acestep.llm_inference import LLMHandler
688
-
689
- # Initialize paths
690
- project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
691
- checkpoint_dir = os.path.join(project_root, "checkpoints")
692
-
693
- print("=" * 80)
694
- print("ACE-Step Inference API Test Suite")
695
- print("=" * 80)
696
-
697
- # ========================================================================
698
- # Initialize Handlers
699
- # ========================================================================
700
- print("\n[1/3] Initializing handlers...")
701
- dit_handler = AceStepHandler(save_root="./")
702
- llm_handler = LLMHandler()
703
-
704
- try:
705
- # Initialize DiT handler
706
- print(" - Initializing DiT model...")
707
- status_dit, success_dit = dit_handler.initialize_service(
708
- project_root=project_root,
709
- config_path="acestep-v15-turbo-rl",
710
- device="cuda",
711
- )
712
- if not success_dit:
713
- print(f" ❌ DiT initialization failed: {status_dit}")
714
- exit(1)
715
- print(f" ✓ DiT model initialized successfully")
716
-
717
- # Initialize LLM handler
718
- print(" - Initializing 5Hz LM model...")
719
- status_llm, success_llm = llm_handler.initialize(
720
- checkpoint_dir=checkpoint_dir,
721
- lm_model_path="acestep-5Hz-lm-0.6B-v3",
722
- backend="vllm",
723
- device="cuda",
724
- )
725
- if success_llm:
726
- print(f" ✓ LM model initialized successfully")
727
- else:
728
- print(f" ⚠ LM initialization failed (will skip LM tests): {status_llm}")
729
-
730
- except Exception as e:
731
- print(f" ❌ Initialization error: {e}")
732
- exit(1)
733
-
734
- # ========================================================================
735
- # Helper Functions
736
- # ========================================================================
737
- def load_example_config(example_file: str) -> GenerationConfig:
738
- """Load configuration from an example JSON file."""
739
- try:
740
- with open(example_file, 'r', encoding='utf-8') as f:
741
- data = json.load(f)
742
-
743
- # Convert example format to GenerationConfig
744
- # Handle time signature format (example uses "4" instead of "4/4")
745
- time_sig = data.get('timesignature', '')
746
- if time_sig and '/' not in time_sig:
747
- time_sig = f"{time_sig}/4" # Default to /4 if only numerator given
748
-
749
- config = GenerationConfig(
750
- caption=data.get('caption', ''),
751
- lyrics=data.get('lyrics', ''),
752
- bpm=data.get('bpm'),
753
- key_scale=data.get('keyscale', ''),
754
- time_signature=time_sig,
755
- vocal_language=data.get('language', 'unknown'),
756
- audio_duration=data.get('duration'),
757
- use_llm_thinking=data.get('think', False),
758
- batch_size=data.get('batch_size', 1),
759
- inference_steps=data.get('inference_steps', 8),
760
- )
761
- return config
762
-
763
- except Exception as e:
764
- print(f" ⚠ Failed to load example file: {e}")
765
- return None
766
-
767
- # ========================================================================
768
- # Test Cases
769
- # ========================================================================
770
- test_results = []
771
-
772
- def run_test(test_name: str, config: GenerationConfig, expected_outputs: int = 1):
773
- """Run a single test case and collect results."""
774
- print(f"\n{'=' * 80}")
775
- print(f"Test: {test_name}")
776
- print(f"{'=' * 80}")
777
-
778
- # Display configuration
779
- print("\nConfiguration:")
780
- print(f" Task Type: {config.task_type}")
781
- print(f" Caption: {config.caption[:60]}..." if len(config.caption) > 60 else f" Caption: {config.caption}")
782
- if config.lyrics:
783
- print(f" Lyrics: {config.lyrics[:60]}..." if len(config.lyrics) > 60 else f" Lyrics: {config.lyrics}")
784
- if config.bpm:
785
- print(f" BPM: {config.bpm}")
786
- if config.key_scale:
787
- print(f" Key Scale: {config.key_scale}")
788
- if config.time_signature:
789
- print(f" Time Signature: {config.time_signature}")
790
- if config.audio_duration:
791
- print(f" Duration: {config.audio_duration}s")
792
- print(f" Batch Size: {config.batch_size}")
793
- print(f" Inference Steps: {config.inference_steps}")
794
- print(f" Use LLM Thinking: {config.use_llm_thinking}")
795
-
796
- # Run generation
797
- print("\nGenerating...")
798
- import time
799
- start_time = time.time()
800
-
801
- result = generate_music(dit_handler, llm_handler, config)
802
-
803
- elapsed_time = time.time() - start_time
804
-
805
- # Display results
806
- print("\nResults:")
807
- print(f" Success: {'✓' if result.success else '✗'}")
808
-
809
- if result.success:
810
- print(f" Generated Files: {len(result.audio_paths)}")
811
- for i, path in enumerate(result.audio_paths, 1):
812
- if os.path.exists(path):
813
- file_size = os.path.getsize(path) / (1024 * 1024) # MB
814
- print(f" [{i}] {os.path.basename(path)} ({file_size:.2f} MB)")
815
- else:
816
- print(f" [{i}] {os.path.basename(path)} (file not found)")
817
-
818
- print(f" Seed: {result.seed_value}")
819
- print(f" Generation Time: {elapsed_time:.2f}s")
820
-
821
- # Display LM metadata if available
822
- if result.lm_metadata:
823
- print(f"\n LM-Generated Metadata:")
824
- for key, value in result.lm_metadata.items():
825
- if key not in ['audio_codes', 'audio_codes_list']: # Skip large code strings
826
- print(f" {key}: {value}")
827
-
828
- # Validate outputs
829
- if len(result.audio_paths) != expected_outputs:
830
- print(f" ⚠ Warning: Expected {expected_outputs} outputs, got {len(result.audio_paths)}")
831
- success = False
832
- else:
833
- success = True
834
-
835
- else:
836
- print(f" Error: {result.error}")
837
- success = False
838
-
839
- # Store test result
840
- test_results.append({
841
- "test_name": test_name,
842
- "success": success,
843
- "generation_success": result.success,
844
- "num_outputs": len(result.audio_paths) if result.success else 0,
845
- "expected_outputs": expected_outputs,
846
- "elapsed_time": elapsed_time,
847
- "error": result.error if not result.success else None,
848
- })
849
-
850
- return result
851
-
852
- # ========================================================================
853
- # Test: Production Example (from examples directory)
854
- # ========================================================================
855
- print("\n[2/3] Running Test...")
856
-
857
- # Load production example (J-Rock song from examples/text2music/example_05.json)
858
- example_file = os.path.join(project_root, "examples", "text2music", "example_05.json")
859
-
860
- if not os.path.exists(example_file):
861
- print(f"\n ❌ Example file not found: {example_file}")
862
- print(" Please ensure the examples directory exists.")
863
- exit(1)
864
-
865
- print(f" Loading example: {os.path.basename(example_file)}")
866
- config = load_example_config(example_file)
867
-
868
- if not config:
869
- print(" ❌ Failed to load example configuration")
870
- exit(1)
871
-
872
- # Reduce duration for faster testing (original is 200s)
873
- print(f" Original duration: {config.audio_duration}s")
874
- config.audio_duration = 30
875
- config.use_random_seed = False
876
- config.seed = 42
877
- print(f" Test duration: {config.audio_duration}s (reduced for testing)")
878
-
879
- run_test("Production Example (J-Rock Song)", config, expected_outputs=1)
880
-
881
- # ========================================================================
882
- # Test Summary
883
- # ========================================================================
884
- print("\n[3/3] Test Summary")
885
- print("=" * 80)
886
-
887
- if len(test_results) == 0:
888
- print("No tests were run.")
889
- exit(1)
890
-
891
- result = test_results[0]
892
-
893
- print(f"\nTest: {result['test_name']}")
894
- print(f"Status: {'✓ PASS' if result['success'] else '✗ FAIL'}")
895
- print(f"Generation: {'Success' if result['generation_success'] else 'Failed'}")
896
- print(f"Outputs: {result['num_outputs']}/{result['expected_outputs']}")
897
- print(f"Time: {result['elapsed_time']:.2f}s")
898
-
899
- if result["error"]:
900
- print(f"Error: {result['error']}")
901
-
902
- # Save test results to JSON
903
- results_file = os.path.join(project_root, "test_results.json")
904
- try:
905
- with open(results_file, "w") as f:
906
- json.dump({
907
- "test_name": result['test_name'],
908
- "success": result['success'],
909
- "generation_success": result['generation_success'],
910
- "num_outputs": result['num_outputs'],
911
- "expected_outputs": result['expected_outputs'],
912
- "elapsed_time": result['elapsed_time'],
913
- "error": result['error'],
914
- }, f, indent=2)
915
- print(f"\n✓ Test results saved to: {results_file}")
916
- except Exception as e:
917
- print(f"\n⚠ Failed to save test results: {e}")
918
-
919
- # Exit with appropriate code
920
- print("\n" + "=" * 80)
921
- if result['success']:
922
- print("Test passed! ✓")
923
- print("=" * 80)
924
- exit(0)
925
- else:
926
- print("Test failed! ✗")
927
- print("=" * 80)
928
- exit(1)
 
7
  """
8
 
9
  import math
10
+ import os
11
+ import tempfile
12
  from typing import Optional, Union, List, Dict, Any, Tuple
13
  from dataclasses import dataclass, field, asdict
14
  from loguru import logger
15
+
16
+ from acestep.audio_utils import AudioSaver, generate_uuid_from_params
17
 
18
 
19
  @dataclass
20
+ class GenerationParams:
21
+ """Configuration for music generation parameters.
22
 
23
  Attributes:
24
  # Text Inputs
25
+ caption: A short text prompt describing the desired music (main prompt). < 512 characters
26
+ lyrics: Lyrics for the music. Use "[Instrumental]" for instrumental songs. < 4096 characters
27
+ instrumental: If True, generate instrumental music regardless of lyrics.
28
 
29
  # Music Metadata
30
+ bpm: BPM (beats per minute), e.g., 120. Set to None for automatic estimation. 30 ~ 300
31
+ keyscale: Musical key (e.g., "C Major", "Am"). Leave empty for auto-detection. A-G, #/♭, major/minor
32
+ timesignature: Time signature (2 for '2/4', 3 for '3/4', 4 for '4/4', 6 for '6/8'). Leave empty for auto-detection.
33
+ vocal_language: Language code for vocals, e.g., "en", "zh", "ja", or "unknown". See acestep/constants.py:VALID_LANGUAGES.
34
+ duration: Target audio length in seconds. If <0 or None, model chooses automatically. 10 ~ 600
35
 
36
  # Generation Parameters
37
+ inference_steps: Number of diffusion steps (e.g., 8 for turbo, 32-100 for base model).
38
+ guidance_scale: CFG (classifier-free guidance) strength. Higher means following the prompt more strictly. Only supported for the non-turbo model.
39
+ seed: Integer seed for reproducibility. -1 means use random seed each time.
 
 
40
 
41
  # Advanced DiT Parameters
42
+ use_adg: Whether to use Adaptive Dual Guidance (only works for base model).
43
+ cfg_interval_start: Start ratio (0.0-1.0) to apply CFG.
44
+ cfg_interval_end: End ratio (0.0-1.0) to apply CFG.
 
45
 
46
  # Task-Specific Parameters
47
+ task_type: Type of generation task. One of: "text2music", "cover", "repaint", "lego", "extract", "complete".
48
+ reference_audio: Path to a reference audio file for style transfer or cover tasks.
49
+ src_audio: Path to a source audio file for audio-to-audio tasks.
50
+ audio_codes: Audio semantic codes as a string (advanced use, for code-control generation).
51
+ repainting_start: For repaint/lego tasks: start time in seconds for region to repaint.
52
+ repainting_end: For repaint/lego tasks: end time in seconds for region to repaint (-1 for until end).
53
+ audio_cover_strength: Strength of reference audio/codes influence (range 0.0-1.0). Set a smaller value (e.g., 0.2) for style transfer tasks.
54
+ instruction: Optional task instruction prompt. If empty, auto-generated by system.
55
 
56
+ # 5Hz Language Model Parameters for CoT reasoning
57
+ thinking: If True, enable 5Hz Language Model "Chain-of-Thought" reasoning for semantic/music metadata and codes.
58
+ lm_temperature: Sampling temperature for the LLM (0.0-2.0). Higher = more creative/varied results.
59
+ lm_cfg_scale: Classifier-free guidance scale for the LLM.
60
+ lm_top_k: LLM top-k sampling (0 = disabled).
61
+ lm_top_p: LLM top-p nucleus sampling (1.0 = disabled).
62
+ lm_negative_prompt: Negative prompt to use for LLM (for control).
63
+ use_cot_metas: Whether to let LLM generate music metadata via CoT reasoning.
64
+ use_cot_caption: Whether to let LLM rewrite or format the input caption via CoT reasoning.
65
+ use_cot_language: Whether to let LLM detect vocal language via CoT.
 
 
 
 
 
 
66
  """
67
+ # Required Inputs
68
+ task_type: str = "text2music"
69
+ instruction: str = "Fill the audio semantic mask based on the given conditions:"
70
+
71
+ # Audio Uploads
72
+ reference_audio: Optional[str] = None
73
+ src_audio: Optional[str] = None
74
+
75
+ # LM Codes Hints
76
+ audio_codes: str = ""
77
+
78
  # Text Inputs
79
  caption: str = ""
80
  lyrics: str = ""
81
+ instrumental: bool = False
82
+
83
+ # Metadata
 
 
84
  vocal_language: str = "unknown"
85
+ bpm: Optional[int] = None
86
+ keyscale: str = ""
87
+ timesignature: str = ""
88
+ duration: float = -1.0
89
+
90
+ # Advanced Settings
91
  inference_steps: int = 8
 
 
92
  seed: int = -1
93
+ guidance_scale: float = 7.0
 
 
94
  use_adg: bool = False
95
  cfg_interval_start: float = 0.0
96
  cfg_interval_end: float = 1.0
97
+
 
 
 
 
 
 
98
  repainting_start: float = 0.0
99
  repainting_end: float = -1
100
  audio_cover_strength: float = 1.0
101
+
 
102
  # 5Hz Language Model Parameters
103
+ thinking: bool = True
104
  lm_temperature: float = 0.85
105
  lm_cfg_scale: float = 2.0
106
  lm_top_k: int = 0
 
108
  lm_negative_prompt: str = "NO USER INPUT"
109
  use_cot_metas: bool = True
110
  use_cot_caption: bool = True
111
+ use_cot_lyrics: bool = False # TODO: not used yet
112
  use_cot_language: bool = True
113
+ use_constrained_decoding: bool = True
114
+
115
+ cot_bpm: Optional[int] = None
116
+ cot_keyscale: str = ""
117
+ cot_timesignature: str = ""
118
+ cot_duration: Optional[float] = None
119
+ cot_vocal_language: str = "unknown"
120
+ cot_caption: str = ""
121
+ cot_lyrics: str = ""
122
+
123
+ def to_dict(self) -> Dict[str, Any]:
124
+ """Convert config to dictionary for JSON serialization."""
125
+ return asdict(self)
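# Illustrative sketch (not part of the diff): constructing GenerationParams for a plain
# text2music request. Only fields defined above are used; the values are examples.
params = GenerationParams(
    task_type="text2music",
    caption="upbeat electronic dance music",
    lyrics="[Instrumental]",
    bpm=128,
    duration=30.0,
    thinking=True,
)
print(params.to_dict()["caption"])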
126
+
127
+
128
+ @dataclass
129
+ class GenerationConfig:
130
+ """Configuration for music generation.
131
 
132
+ Attributes:
133
+ batch_size: Number of audio samples to generate
134
+ allow_lm_batch: Whether to allow batch processing in LM
135
+ use_random_seed: Whether to use random seed
136
+ seeds: Seed(s) for batch generation. Can be:
137
+ - None: Use random seeds (when use_random_seed=True) or params.seed (when use_random_seed=False)
138
+ - List[int]: List of seeds, will be padded with random seeds if fewer than batch_size
139
+ - int: Single seed value (will be converted to list and padded)
140
+ lm_batch_chunk_size: Batch chunk size for LM processing
141
+ constrained_decoding_debug: Whether to enable constrained decoding debug
142
+ audio_format: Output audio format, one of "mp3", "wav", "flac". Default: "flac"
143
+ """
144
+ batch_size: int = 2
145
  allow_lm_batch: bool = False
146
+ use_random_seed: bool = True
147
+ seeds: Optional[List[int]] = None
148
+ lm_batch_chunk_size: int = 8
149
+ constrained_decoding_debug: bool = False
150
+ audio_format: str = "flac" # Default to FLAC for fast saving
151
+
152
+ def to_dict(self) -> Dict[str, Any]:
153
+ """Convert config to dictionary for JSON serialization."""
154
+ return asdict(self)
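# Illustrative sketch (not part of the diff): a batch-of-two configuration with fixed seeds and
# WAV output. Only fields defined above are used.
config = GenerationConfig(
    batch_size=2,
    use_random_seed=False,
    seeds=[42, 43],
    audio_format="wav",
)
print(config.to_dict())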
155
 
156
 
157
  @dataclass
 
160
 
161
  Attributes:
162
  # Audio Outputs
163
+ audios: List of audio dictionaries with paths, keys, params
 
 
 
 
 
164
  status_message: Status message from generation
165
+ extra_outputs: Extra outputs from generation
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  success: Whether generation completed successfully
167
  error: Error message if generation failed
168
  """
169
+
170
  # Audio Outputs
171
+ audios: List[Dict[str, Any]] = field(default_factory=list)
 
 
 
172
  # Generation Information
 
173
  status_message: str = ""
174
+ extra_outputs: Dict[str, Any] = field(default_factory=dict)
 
 
 
 
 
 
 
 
 
 
 
 
175
  # Success Status
176
  success: bool = True
177
  error: Optional[str] = None
178
+
179
  def to_dict(self) -> Dict[str, Any]:
180
  """Convert result to dictionary for JSON serialization."""
181
  return asdict(self)
182
 
183
 
184
+ def _update_metadata_from_lm(
185
+ metadata: Dict[str, Any],
186
+ bpm: Optional[int],
187
+ key_scale: str,
188
+ time_signature: str,
189
+ audio_duration: Optional[float],
190
+ vocal_language: str,
191
+ caption: str,
192
+ lyrics: str,
193
+ ) -> Tuple[Optional[int], str, str, Optional[float]]:
194
+ """Update metadata fields from LM output if not provided by user."""
195
+
196
+ if bpm is None and metadata.get('bpm'):
197
+ bpm_value = metadata.get('bpm')
198
+ if bpm_value not in ["N/A", ""]:
199
+ try:
200
+ bpm = int(bpm_value)
201
+ except (ValueError, TypeError):
202
+ pass
203
+
204
+ if not key_scale and metadata.get('keyscale'):
205
+ key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
206
+ if key_scale_value != "N/A":
207
+ key_scale = key_scale_value
208
+
209
+ if not time_signature and metadata.get('timesignature'):
210
+ time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
211
+ if time_signature_value != "N/A":
212
+ time_signature = time_signature_value
213
+
214
+ if audio_duration is None or audio_duration <= 0:
215
+ audio_duration_value = metadata.get('duration', -1)
216
+ if audio_duration_value not in ["N/A", ""]:
217
+ try:
218
+ audio_duration = float(audio_duration_value)
219
+ except (ValueError, TypeError):
220
+ pass
221
+
222
+ if not vocal_language and metadata.get('vocal_language'):
223
+ vocal_language = metadata.get('vocal_language')
224
+ if not caption and metadata.get('caption'):
225
+ caption = metadata.get('caption')
226
+ if not lyrics and metadata.get('lyrics'):
227
+ lyrics = metadata.get('lyrics')
228
+ return bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics
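# Illustrative sketch (not part of the diff): user-supplied values win, LM metadata fills the gaps.
lm_meta = {"bpm": "120", "keyscale": "A minor", "duration": "95", "caption": "dark synthwave"}
print(_update_metadata_from_lm(
    lm_meta, bpm=None, key_scale="", time_signature="4/4", audio_duration=None,
    vocal_language="en", caption="", lyrics="[Instrumental]",
))
# -> (120, 'A minor', '4/4', 95.0, 'en', 'dark synthwave', '[Instrumental]')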
229
+
230
+
231
  def generate_music(
232
  dit_handler,
233
  llm_handler,
234
+ params: GenerationParams,
235
  config: GenerationConfig,
236
+ save_dir: Optional[str] = None,
237
+ progress=None,
238
  ) -> GenerationResult:
239
  """Generate music using ACE-Step model with optional LM reasoning.
240
 
 
 
 
 
241
  Args:
242
  dit_handler: Initialized DiT model handler (AceStepHandler instance)
243
  llm_handler: Initialized LLM handler (LLMHandler instance)
244
+ params: Generation parameters (GenerationParams instance)
245
  config: Generation configuration (GenerationConfig instance)
246
 
247
  Returns:
248
+ GenerationResult with generated audio files and metadata
 
 
 
 
 
249
  """
 
250
  try:
251
  # Phase 1: LM-based metadata and code generation (if enabled)
252
+ audio_code_string_to_use = params.audio_codes
253
  lm_generated_metadata = None
 
254
  lm_generated_audio_codes_list = []
255
+ lm_total_time_costs = {
256
+ "phase1_time": 0.0,
257
+ "phase2_time": 0.0,
258
+ "total_time": 0.0,
259
+ }
260
+
261
  # Extract mutable copies of metadata (will be updated by LM if needed)
262
+ bpm = params.bpm
263
+ key_scale = params.keyscale
264
+ time_signature = params.timesignature
265
+ audio_duration = params.duration
266
+ dit_input_caption = params.caption
267
+ dit_input_vocal_language = params.vocal_language
268
+ dit_input_lyrics = params.lyrics
269
+ # Determine if we need to generate audio codes
270
+ # If user has provided audio_codes, we don't need to generate them
271
+ # Otherwise, check if we need audio codes (llm_dit mode) or just metas (dit mode)
272
+ user_provided_audio_codes = bool(params.audio_codes and str(params.audio_codes).strip())
273
+
274
+ # Determine infer_type: use "llm_dit" if we need audio codes, "dit" if only metas needed
275
+ # For now, we use "llm_dit" if batch mode or if user hasn't provided codes
276
+ # Use "dit" if user has provided codes (only need metas) or if explicitly only need metas
277
+ # Note: This logic can be refined based on specific requirements
278
+ need_audio_codes = not user_provided_audio_codes
279
+
280
+ # Determine if we should use chunk-based LM generation (always use chunks for consistency)
281
+ # Determine actual batch size for chunk processing
282
+ actual_batch_size = config.batch_size if config.batch_size is not None else 1
283
+
284
+ # Prepare seeds for batch generation
285
+ # Use config.seeds if provided; prepare_seeds pads with random seeds as needed
286
+ # Convert config.seeds (None or List[int]) to the comma-separated string prepare_seeds accepts
287
+ seed_for_generation = ""
288
+ if config.seeds is not None and len(config.seeds) > 0:
289
+ if isinstance(config.seeds, list):
290
+ # Convert List[int] to comma-separated string
291
+ seed_for_generation = ",".join(str(s) for s in config.seeds)
292
+
293
+ # Use dit_handler.prepare_seeds to handle seed list generation and padding
294
+ # This will handle all the logic: padding with random seeds if needed, etc.
295
+ actual_seed_list, _ = dit_handler.prepare_seeds(actual_batch_size, seed_for_generation, config.use_random_seed)
296
+
297
  # LM-based Chain-of-Thought reasoning
298
+ use_lm = params.thinking and llm_handler.llm_initialized
299
+ lm_status = []
300
+ if use_lm:
301
+ # Convert sampling parameters - handle None values safely
302
+ top_k_value = None if not params.lm_top_k or params.lm_top_k == 0 else int(params.lm_top_k)
303
+ top_p_value = None if not params.lm_top_p or params.lm_top_p >= 1.0 else params.lm_top_p
304
+
305
  # Build user_metadata from user-provided values
306
  user_metadata = {}
307
  if bpm is not None:
308
  try:
309
  bpm_value = float(bpm)
310
  if bpm_value > 0:
311
+ user_metadata['bpm'] = int(bpm_value)
312
  except (ValueError, TypeError):
313
  pass
314
+
315
  if key_scale and key_scale.strip():
316
  key_scale_clean = key_scale.strip()
317
  if key_scale_clean.lower() not in ["n/a", ""]:
318
  user_metadata['keyscale'] = key_scale_clean
319
+
320
  if time_signature and time_signature.strip():
321
  time_sig_clean = time_signature.strip()
322
  if time_sig_clean.lower() not in ["n/a", ""]:
323
  user_metadata['timesignature'] = time_sig_clean
324
+
325
  if audio_duration is not None:
326
  try:
327
  duration_value = float(audio_duration)
328
  if duration_value > 0:
329
+ user_metadata['duration'] = int(duration_value)
330
  except (ValueError, TypeError):
331
  pass
332
+
333
  user_metadata_to_pass = user_metadata if user_metadata else None
334
+
335
+ # Determine infer_type based on whether we need audio codes
336
+ # - "llm_dit": generates both metas and audio codes (two-phase internally)
337
+ # - "dit": generates only metas (single phase)
338
+ infer_type = "llm_dit" if need_audio_codes else "dit"
339
+
340
+ # Use chunk size from config, or default to batch_size if not set
341
+ max_inference_batch_size = int(config.lm_batch_chunk_size) if config.lm_batch_chunk_size > 0 else actual_batch_size
342
+ num_chunks = math.ceil(actual_batch_size / max_inference_batch_size)
343
+
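The chunking above splits the requested batch into slices of at most lm_batch_chunk_size items; a standalone sketch of the arithmetic with hypothetical sizes:

import math
batch_size, chunk_cap = 5, 2
num_chunks = math.ceil(batch_size / chunk_cap)  # 3
bounds = [(i * chunk_cap, min((i + 1) * chunk_cap, batch_size)) for i in range(num_chunks)]
# bounds == [(0, 2), (2, 4), (4, 5)] -> chunk sizes 2, 2, 1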
344
+ all_metadata_list = []
345
+ all_audio_codes_list = []
346
+
347
+ for chunk_idx in range(num_chunks):
348
+ chunk_start = chunk_idx * max_inference_batch_size
349
+ chunk_end = min(chunk_start + max_inference_batch_size, actual_batch_size)
350
+ chunk_size = chunk_end - chunk_start
351
+ chunk_seeds = actual_seed_list[chunk_start:chunk_end] if chunk_start < len(actual_seed_list) else None
352
+
353
+ logger.info(f"LM chunk {chunk_idx+1}/{num_chunks} (infer_type={infer_type}) "
354
+ f"(size: {chunk_size}, seeds: {chunk_seeds})")
355
+
356
+ # Use the determined infer_type
357
+ # - "llm_dit" will internally run two phases (metas + codes)
358
+ # - "dit" will only run phase 1 (metas only)
359
+ result = llm_handler.generate_with_stop_condition(
360
+ caption=params.caption or "",
361
+ lyrics=params.lyrics or "",
362
+ infer_type=infer_type,
363
+ temperature=params.lm_temperature,
364
+ cfg_scale=params.lm_cfg_scale,
365
+ negative_prompt=params.lm_negative_prompt,
366
  top_k=top_k_value,
367
  top_p=top_p_value,
368
  user_metadata=user_metadata_to_pass,
369
+ use_cot_caption=params.use_cot_caption,
370
+ use_cot_language=params.use_cot_language,
371
+ use_cot_metas=params.use_cot_metas,
372
+ use_constrained_decoding=params.use_constrained_decoding,
373
  constrained_decoding_debug=config.constrained_decoding_debug,
374
+ batch_size=chunk_size,
375
+ seeds=chunk_seeds,
376
+ progress=progress,
377
  )
378
+
379
+ # Check if LM generation failed
380
+ if not result.get("success", False):
381
+ error_msg = result.get("error", "Unknown LM error")
382
+ lm_status.append(f"❌ LM Error: {error_msg}")
383
+ # Return early with error
384
+ return GenerationResult(
385
+ audios=[],
386
+ status_message=f"❌ LM generation failed: {error_msg}",
387
+ extra_outputs={},
388
+ success=False,
389
+ error=error_msg,
390
  )
391
+
392
+ # Extract metadata and audio_codes from result dict
393
+ if chunk_size > 1:
394
+ metadata_list = result.get("metadata", [])
395
+ audio_codes_list = result.get("audio_codes", [])
396
+ all_metadata_list.extend(metadata_list)
397
+ all_audio_codes_list.extend(audio_codes_list)
398
+ else:
399
+ metadata = result.get("metadata", {})
400
+ audio_codes = result.get("audio_codes", "")
401
+ all_metadata_list.append(metadata)
402
+ all_audio_codes_list.append(audio_codes)
403
+
404
+ # Collect time costs from LM extra_outputs
405
+ lm_extra = result.get("extra_outputs", {})
406
+ lm_chunk_time_costs = lm_extra.get("time_costs", {})
407
+ if lm_chunk_time_costs:
408
+ # Accumulate time costs from all chunks
409
+ for key in ["phase1_time", "phase2_time", "total_time"]:
410
+ if key in lm_chunk_time_costs:
411
+ lm_total_time_costs[key] += lm_chunk_time_costs[key]
412
+
413
+ time_str = ", ".join([f"{k}: {v:.2f}s" for k, v in lm_chunk_time_costs.items()])
414
+ lm_status.append(f"✅ LM chunk {chunk_idx+1}: {time_str}")
415
+
416
+ lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
417
+ lm_generated_audio_codes_list = all_audio_codes_list
418
+
419
+ # Set audio_code_string_to_use based on infer_type
420
+ if infer_type == "llm_dit":
421
+ # If batch mode, use list; otherwise use single string
422
+ if actual_batch_size > 1:
423
+ audio_code_string_to_use = all_audio_codes_list
424
+ else:
425
+ audio_code_string_to_use = all_audio_codes_list[0] if all_audio_codes_list else ""
426
+ else:
427
+ # For "dit" mode, keep user-provided codes or empty
428
+ audio_code_string_to_use = params.audio_codes
429
+
430
+ # Update metadata from LM if not provided by user
431
+ if lm_generated_metadata:
432
+ bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics = _update_metadata_from_lm(
433
+ metadata=lm_generated_metadata,
434
+ bpm=bpm,
435
+ key_scale=key_scale,
436
+ time_signature=time_signature,
437
+ audio_duration=audio_duration,
438
+ vocal_language=dit_input_vocal_language,
439
+ caption=dit_input_caption,
440
+ lyrics=dit_input_lyrics)
441
+ if not params.bpm:
442
+ params.cot_bpm = bpm
443
+ if not params.keyscale:
444
+ params.cot_keyscale = key_scale
445
+ if not params.timesignature:
446
+ params.cot_timesignature = time_signature
447
+ if not params.duration:
448
+ params.cot_duration = audio_duration
449
+ if not params.vocal_language:
450
+ params.cot_vocal_language = vocal_language
451
+ if not params.caption:
452
+ params.cot_caption = caption
453
+ if not params.lyrics:
454
+ params.cot_lyrics = lyrics
455
+
456
+ # Set CoT caption and language if needed
457
+ if params.use_cot_caption:
458
+ dit_input_caption = lm_generated_metadata.get("caption", dit_input_caption)
459
+ if params.use_cot_language:
460
+ dit_input_vocal_language = lm_generated_metadata.get("vocal_language", dit_input_vocal_language)
461
+
462
  # Phase 2: DiT music generation
463
+ # Use the seed string derived from config.seeds for the actual generation
464
  result = dit_handler.generate_music(
465
+ captions=dit_input_caption,
466
+ lyrics=dit_input_lyrics,
467
  bpm=bpm,
468
  key_scale=key_scale,
469
  time_signature=time_signature,
470
+ vocal_language=dit_input_vocal_language,
471
+ inference_steps=params.inference_steps,
472
+ guidance_scale=params.guidance_scale,
473
  use_random_seed=config.use_random_seed,
474
+ seed=seed_for_generation,  # seed string derived from config.seeds
475
+ reference_audio=params.reference_audio,
476
  audio_duration=audio_duration,
477
+ batch_size=config.batch_size if config.batch_size is not None else 1,
478
+ src_audio=params.src_audio,
479
  audio_code_string=audio_code_string_to_use,
480
+ repainting_start=params.repainting_start,
481
+ repainting_end=params.repainting_end,
482
+ instruction=params.instruction,
483
+ audio_cover_strength=params.audio_cover_strength,
484
+ task_type=params.task_type,
485
+ use_adg=params.use_adg,
486
+ cfg_interval_start=params.cfg_interval_start,
487
+ cfg_interval_end=params.cfg_interval_end,
488
+ progress=progress,
 
489
  )
490
+
491
+ # Check if generation failed
492
+ if not result.get("success", False):
493
+ return GenerationResult(
494
+ audios=[],
495
+ status_message=result.get("status_message", ""),
496
+ extra_outputs={},
497
+ success=False,
498
+ error=result.get("error"),
499
+ )
500
+
501
+ # Extract results from dit_handler.generate_music dict
502
+ dit_audios = result.get("audios", [])
503
+ status_message = result.get("status_message", "")
504
+ dit_extra_outputs = result.get("extra_outputs", {})
505
+
506
+ # Use the seed list already prepared above (from config.seed or params.seed fallback)
507
+ # actual_seed_list was computed earlier using dit_handler.prepare_seeds
508
+ seed_list = actual_seed_list
509
+
510
+ # Get base params dictionary
511
+ base_params_dict = params.to_dict()
512
+
513
+ # Save audio files using AudioSaver (format from config)
514
+ audio_format = config.audio_format if config.audio_format else "flac"
515
+ audio_saver = AudioSaver(default_format=audio_format)
516
+
517
+ # Use handler's temp_dir for saving files
518
+ if save_dir is not None:
519
+ os.makedirs(save_dir, exist_ok=True)
520
+
521
+ # Build audios list for GenerationResult with params and save files
522
+ # Audio saving and UUID generation handled here, outside of handler
523
+ audios = []
524
+ for idx, dit_audio in enumerate(dit_audios):
525
+ # Create a copy of params dict for this audio
526
+ audio_params = base_params_dict.copy()
527
+
528
+ # Update audio-specific values
529
+ audio_params["seed"] = seed_list[idx] if idx < len(seed_list) else None
530
+
531
+ # Add audio codes if batch mode
532
+ if lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list):
533
+ audio_params["audio_codes"] = lm_generated_audio_codes_list[idx]
534
+
535
+ # Get audio tensor and metadata
536
+ audio_tensor = dit_audio.get("tensor")
537
+ sample_rate = dit_audio.get("sample_rate", 48000)
538
+
539
+ # Generate UUID for this audio (moved from handler)
540
+ batch_seed = seed_list[idx] if idx < len(seed_list) else seed_list[0] if seed_list else -1
541
+ audio_code_str = lm_generated_audio_codes_list[idx] if (
542
+ lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list)) else audio_code_string_to_use
543
+ if isinstance(audio_code_str, list):
544
+ audio_code_str = audio_code_str[idx] if idx < len(audio_code_str) else ""
545
+
546
+ audio_key = generate_uuid_from_params(audio_params)
547
+
548
+ # Save audio file (handled outside handler)
549
+ audio_path = None
550
+ if audio_tensor is not None and save_dir is not None:
551
+ try:
552
+ audio_file = os.path.join(save_dir, f"{audio_key}.{audio_format}")
553
+ audio_path = audio_saver.save_audio(audio_tensor,
554
+ audio_file,
555
+ sample_rate=sample_rate,
556
+ format=audio_format,
557
+ channels_first=True)
558
+ except Exception as e:
559
+ logger.error(f"[generate_music] Failed to save audio file: {e}")
560
+ audio_path = "" # Fallback to empty path
561
+
562
+ audio_dict = {
563
+ "path": audio_path or "", # File path (saved here, not in handler)
564
+ "tensor": audio_tensor, # Audio tensor [channels, samples], CPU, float32
565
+ "key": audio_key,
566
+ "sample_rate": sample_rate,
567
+ "params": audio_params,
568
+ }
569
+
570
+ audios.append(audio_dict)
571
+
572
+ # Merge extra_outputs: include dit_extra_outputs (latents, masks) and add LM metadata
573
+ extra_outputs = dit_extra_outputs.copy()
574
+ extra_outputs["lm_metadata"] = lm_generated_metadata
575
+
576
+ # Merge time_costs from both LM and DiT into a unified dictionary
577
+ unified_time_costs = {}
578
+
579
+ # Add LM time costs (if LM was used)
580
+ if use_lm and lm_total_time_costs:
581
+ for key, value in lm_total_time_costs.items():
582
+ unified_time_costs[f"lm_{key}"] = value
583
+
584
+ # Add DiT time costs (if available)
585
+ dit_time_costs = dit_extra_outputs.get("time_costs", {})
586
+ if dit_time_costs:
587
+ for key, value in dit_time_costs.items():
588
+ unified_time_costs[f"dit_{key}"] = value
589
+
590
+ # Calculate total pipeline time
591
+ if unified_time_costs:
592
+ lm_total = unified_time_costs.get("lm_total_time", 0.0)
593
+ dit_total = unified_time_costs.get("dit_total_time_cost", 0.0)
594
+ unified_time_costs["pipeline_total_time"] = lm_total + dit_total
595
+
596
+ # Update extra_outputs with unified time_costs
597
+ extra_outputs["time_costs"] = unified_time_costs
598
+
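A standalone sketch of the time-cost merge above with illustrative numbers (the keys mirror the dictionaries built in this function):

lm_costs = {"phase1_time": 1.2, "phase2_time": 3.4, "total_time": 4.6}
dit_costs = {"total_time_cost": 7.5}
unified = {f"lm_{k}": v for k, v in lm_costs.items()}
unified.update({f"dit_{k}": v for k, v in dit_costs.items()})
unified["pipeline_total_time"] = unified.get("lm_total_time", 0.0) + unified.get("dit_total_time_cost", 0.0)
# unified["pipeline_total_time"] == 12.1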
599
+ if lm_status:
600
+ status_message = "\n".join(lm_status) + "\n" + status_message
601
+ else:
602
+ status_message = status_message
603
+ # Create and return GenerationResult
604
  return GenerationResult(
605
+ audios=audios,
606
  status_message=status_message,
607
+ extra_outputs=extra_outputs,
608
  success=True,
609
  error=None,
610
  )
611
+
612
  except Exception as e:
613
  logger.exception("Music generation failed")
614
  return GenerationResult(
615
+ audios=[],
616
+ status_message=f"Error: {str(e)}",
617
+ extra_outputs={},
618
  success=False,
619
  error=str(e),
620
  )
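For orientation, a hedged end-to-end sketch of calling the refactored entry point; the GenerationParams and GenerationConfig constructor arguments are assumptions inferred from the attribute accesses in this diff (not a verified signature), and dit_handler/llm_handler are assumed to be already-initialized handler instances:

params = GenerationParams(caption="dreamy synthwave", lyrics="", thinking=True)
config = GenerationConfig(batch_size=2, seeds=[42, 43], use_random_seed=False, audio_format="flac")
result = generate_music(dit_handler, llm_handler, params, config, save_dir="outputs")
if result.success:
    for audio in result.audios:
        print(audio["path"], audio["params"].get("seed"))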
acestep/llm_inference.py CHANGED
@@ -5,7 +5,8 @@ Handles all LM-related operations including initialization and generation
5
  import os
6
  import traceback
7
  import time
8
- from typing import Optional, Dict, Any, Tuple, List
 
9
  from contextlib import contextmanager
10
 
11
  import yaml
@@ -85,6 +86,189 @@ class LLMHandler:
85
  except Exception as e:
86
  return 0.9, False
87
 
88
  def initialize(
89
  self,
90
  checkpoint_dir: str,
@@ -126,6 +310,7 @@ class LLMHandler:
126
 
127
  logger.info("loading 5Hz LM tokenizer...")
128
  start_time = time.time()
 
129
  llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
130
  logger.info(f"5Hz LM tokenizer loaded successfully in {time.time() - start_time:.2f} seconds")
131
  self.llm_tokenizer = llm_tokenizer
@@ -150,41 +335,21 @@ class LLMHandler:
150
  # vllm initialization failed, fallback to PyTorch
151
  if not self.llm_initialized:
152
  logger.warning("vllm initialization failed, falling back to PyTorch backend")
153
- try:
154
- self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
155
- if not self.offload_to_cpu:
156
- self.llm = self.llm.to(device).to(self.dtype)
157
- else:
158
- self.llm = self.llm.to("cpu").to(self.dtype)
159
- self.llm.eval()
160
- self.llm_backend = "pt"
161
- self.llm_initialized = True
162
- logger.info("5Hz LM initialized successfully using PyTorch backend (fallback)")
163
- status_msg = f"✅ 5Hz LM initialized successfully (PyTorch fallback)\nModel: {full_lm_model_path}\nBackend: PyTorch"
164
- except Exception as e:
165
- return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
166
  # If vllm initialization succeeded, self.llm_initialized should already be True
167
  else:
168
  # Use PyTorch backend (pt)
169
- try:
170
- self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
171
- if not self.offload_to_cpu:
172
- self.llm = self.llm.to(device).to(self.dtype)
173
- else:
174
- self.llm = self.llm.to("cpu").to(self.dtype)
175
- self.llm.eval()
176
- self.llm_backend = "pt"
177
- self.llm_initialized = True
178
- logger.info(f"5Hz LM initialized successfully using PyTorch backend on {device}")
179
- status_msg = f"✅ 5Hz LM initialized successfully\nModel: {full_lm_model_path}\nBackend: PyTorch\nDevice: {device}"
180
- except Exception as e:
181
- return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
182
 
183
  return status_msg, True
184
 
185
  except Exception as e:
186
- error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
187
- return error_msg, False
188
 
189
  def _initialize_5hz_lm_vllm(self, model_path: str) -> str:
190
  """Initialize 5Hz LM model using vllm backend"""
@@ -230,12 +395,11 @@ class LLMHandler:
230
  return f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nDevice: {device_name}\nGPU Memory Utilization: {gpu_memory_utilization:.2f}"
231
  except Exception as e:
232
  self.llm_initialized = False
233
- error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
234
- return error_msg
235
 
236
- def _run_vllm_from_formatted(
237
  self,
238
- formatted_prompt: str,
239
  temperature: float,
240
  cfg_scale: float,
241
  negative_prompt: str,
@@ -244,7 +408,7 @@ class LLMHandler:
244
  repetition_penalty: float,
245
  use_constrained_decoding: bool = True,
246
  constrained_decoding_debug: bool = False,
247
- metadata_temperature: Optional[float] = 0.85,
248
  codes_temperature: Optional[float] = None,
249
  target_duration: Optional[float] = None,
250
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
@@ -256,37 +420,40 @@ class LLMHandler:
256
  caption: str = "",
257
  lyrics: str = "",
258
  cot_text: str = "",
259
- ) -> str:
260
- """Shared vllm path: accept prebuilt formatted prompt and return text."""
261
  from nanovllm import SamplingParams
262
 
263
  # Determine effective temperature for sampler
264
- use_phase_temperatures = metadata_temperature is not None or codes_temperature is not None
 
 
265
  effective_sampler_temp = 1.0 if use_phase_temperatures else temperature
266
 
267
- # Use shared constrained processor if enabled
268
- constrained_processor = None
269
- if use_constrained_decoding or use_phase_temperatures:
270
- # Reset processor state for new generation
271
- self.constrained_processor.reset()
272
-
273
- # Use shared processor, just update caption and settings
274
- self.constrained_processor.enabled = use_constrained_decoding
275
- self.constrained_processor.debug = constrained_decoding_debug
276
- self.constrained_processor.metadata_temperature = metadata_temperature if use_phase_temperatures else None
277
- self.constrained_processor.codes_temperature = codes_temperature if use_phase_temperatures else None
278
- self.constrained_processor.set_target_duration(target_duration)
279
- # Always call set_user_metadata to ensure previous settings are cleared if None
280
- self.constrained_processor.set_user_metadata(user_metadata)
281
- self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
282
- # Set skip_caption and skip_language based on flags
283
- self.constrained_processor.set_skip_genres(skip_genres)
284
- self.constrained_processor.set_skip_caption(skip_caption)
285
- self.constrained_processor.set_skip_language(skip_language)
286
- # Set generation phase for phase-aware processing
287
- self.constrained_processor.set_generation_phase(generation_phase)
288
-
289
- constrained_processor = self.constrained_processor
290
 
291
  sampling_params = SamplingParams(
292
  max_tokens=self.max_model_len - 64,
@@ -301,119 +468,25 @@ class LLMHandler:
301
 
302
  if cfg_scale > 1.0:
303
  # Build unconditional prompt based on generation phase
304
- if generation_phase == "codes":
305
- # Codes phase: use empty CoT in unconditional prompt
306
- # formatted_prompt was built with build_formatted_prompt_with_cot(caption, lyrics, cot_text)
307
- # For unconditional, we use empty CoT: build_formatted_prompt_with_cot(caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=...)
308
- formatted_unconditional_prompt = self.build_formatted_prompt_with_cot(
309
- caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
310
- )
311
- else:
312
- # CoT phase: unconditional prompt
313
- # If negative_prompt is provided, use it as caption; otherwise remove caption and keep only lyrics
314
- formatted_unconditional_prompt = self.build_formatted_prompt(
315
- caption, lyrics, is_negative_prompt=True, generation_phase="cot", negative_prompt=negative_prompt
316
- )
317
-
318
- outputs = self.llm.generate(
319
- [formatted_prompt],
320
- sampling_params,
321
- unconditional_prompts=[formatted_unconditional_prompt],
322
- )
323
- else:
324
- outputs = self.llm.generate([formatted_prompt], sampling_params)
325
-
326
- # Extract text (retain original selection order/logic)
327
- if isinstance(outputs, list) and len(outputs) > 0:
328
- if hasattr(outputs[0], "outputs") and len(outputs[0].outputs) > 0:
329
- output_text = outputs[0].outputs[0].text
330
- elif hasattr(outputs[0], "text"):
331
- output_text = outputs[0].text
332
- elif isinstance(outputs[0], dict) and "text" in outputs[0]:
333
- output_text = outputs[0]["text"]
334
- else:
335
- output_text = str(outputs[0])
336
- else:
337
- output_text = str(outputs)
338
-
339
- return output_text
340
-
341
- def _run_vllm_batch(
342
- self,
343
- formatted_prompts: List[str],
344
- temperature: float,
345
- cfg_scale: float,
346
- negative_prompt: str,
347
- top_k: Optional[int],
348
- top_p: Optional[float],
349
- repetition_penalty: float,
350
- use_constrained_decoding: bool = True,
351
- constrained_decoding_debug: bool = False,
352
- target_duration: Optional[float] = None,
353
- generation_phase: str = "codes",
354
- caption: str = "",
355
- lyrics: str = "",
356
- cot_text: str = "",
357
- seeds: Optional[List[int]] = None,
358
- ) -> List[str]:
359
- """Batch generation using vllm backend"""
360
- from nanovllm import SamplingParams
361
-
362
- batch_size = len(formatted_prompts)
363
-
364
- # Determine effective temperature for sampler
365
- effective_sampler_temp = temperature
366
-
367
- # Use shared constrained processor if enabled
368
- # Note: vllm batch mode uses same processor for all items
369
- constrained_processor = None
370
- if use_constrained_decoding:
371
- # Reset processor state for new generation
372
- self.constrained_processor.reset()
373
-
374
- self.constrained_processor.enabled = use_constrained_decoding
375
- self.constrained_processor.debug = constrained_decoding_debug
376
- self.constrained_processor.metadata_temperature = None
377
- self.constrained_processor.codes_temperature = None
378
- self.constrained_processor.set_target_duration(target_duration)
379
- self.constrained_processor.set_user_metadata(None)
380
- self.constrained_processor.set_stop_at_reasoning(False)
381
- self.constrained_processor.set_skip_genres(True)
382
- self.constrained_processor.set_skip_caption(True)
383
- self.constrained_processor.set_skip_language(True)
384
- self.constrained_processor.set_generation_phase(generation_phase)
385
-
386
- constrained_processor = self.constrained_processor
387
-
388
- # Build sampling params
389
- sampling_params = SamplingParams(
390
- max_tokens=self.max_model_len - 64,
391
- temperature=effective_sampler_temp,
392
- cfg_scale=cfg_scale,
393
- top_k=top_k,
394
- top_p=top_p,
395
- repetition_penalty=repetition_penalty,
396
- logits_processor=constrained_processor,
397
- logits_processor_update_state=constrained_processor.update_state if constrained_processor else None,
398
- )
399
-
400
- # Generate with or without CFG
401
- if cfg_scale > 1.0:
402
- # Build unconditional prompts
403
- formatted_unconditional_prompt = self.build_formatted_prompt_with_cot(
404
- caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
405
  )
406
  unconditional_prompts = [formatted_unconditional_prompt] * batch_size
407
 
408
  outputs = self.llm.generate(
409
- formatted_prompts,
410
  sampling_params,
411
  unconditional_prompts=unconditional_prompts,
412
  )
413
  else:
414
- outputs = self.llm.generate(formatted_prompts, sampling_params)
415
-
416
- # Extract text from each output
417
  output_texts = []
418
  for output in outputs:
419
  if hasattr(output, "outputs") and len(output.outputs) > 0:
@@ -424,70 +497,11 @@ class LLMHandler:
424
  output_texts.append(output["text"])
425
  else:
426
  output_texts.append(str(output))
427
-
428
- return output_texts
429
 
430
- def _run_pt_batch(
431
- self,
432
- formatted_prompts: List[str],
433
- temperature: float,
434
- cfg_scale: float,
435
- negative_prompt: str,
436
- top_k: Optional[int],
437
- top_p: Optional[float],
438
- repetition_penalty: float,
439
- use_constrained_decoding: bool = True,
440
- constrained_decoding_debug: bool = False,
441
- target_duration: Optional[float] = None,
442
- generation_phase: str = "codes",
443
- caption: str = "",
444
- lyrics: str = "",
445
- cot_text: str = "",
446
- seeds: Optional[List[int]] = None,
447
- ) -> List[str]:
448
- """Batch generation using PyTorch backend"""
449
- import random
450
-
451
- batch_size = len(formatted_prompts)
452
- output_texts = []
453
-
454
- # Generate each item sequentially with different seeds
455
- # (PyTorch backend doesn't support true batching efficiently)
456
- for i, formatted_prompt in enumerate(formatted_prompts):
457
- # Set seed for this item if provided
458
- if seeds and i < len(seeds):
459
- torch.manual_seed(seeds[i])
460
- if torch.cuda.is_available():
461
- torch.cuda.manual_seed_all(seeds[i])
462
-
463
- # Generate using single-item method
464
- output_text = self._run_pt_from_formatted(
465
- formatted_prompt=formatted_prompt,
466
- temperature=temperature,
467
- cfg_scale=cfg_scale,
468
- negative_prompt=negative_prompt,
469
- top_k=top_k,
470
- top_p=top_p,
471
- repetition_penalty=repetition_penalty,
472
- use_constrained_decoding=use_constrained_decoding,
473
- constrained_decoding_debug=constrained_decoding_debug,
474
- target_duration=target_duration,
475
- user_metadata=None,
476
- stop_at_reasoning=False,
477
- skip_genres=True,
478
- skip_caption=True,
479
- skip_language=True,
480
- generation_phase=generation_phase,
481
- caption=caption,
482
- lyrics=lyrics,
483
- cot_text=cot_text,
484
- )
485
-
486
- output_texts.append(output_text)
487
-
488
- return output_texts
489
 
490
- def _run_pt_from_formatted(
491
  self,
492
  formatted_prompt: str,
493
  temperature: float,
@@ -496,20 +510,20 @@ class LLMHandler:
496
  top_k: Optional[int],
497
  top_p: Optional[float],
498
  repetition_penalty: float,
499
- use_constrained_decoding: bool = True,
500
- constrained_decoding_debug: bool = False,
501
- target_duration: Optional[float] = None,
502
- user_metadata: Optional[Dict[str, Optional[str]]] = None,
503
- stop_at_reasoning: bool = False,
504
- skip_genres: bool = True,
505
- skip_caption: bool = False,
506
- skip_language: bool = False,
507
- generation_phase: str = "cot",
508
- caption: str = "",
509
- lyrics: str = "",
510
- cot_text: str = "",
511
  ) -> str:
512
- """Shared PyTorch path: accept prebuilt formatted prompt and return text."""
513
  inputs = self.llm_tokenizer(
514
  formatted_prompt,
515
  return_tensors="pt",
@@ -517,27 +531,19 @@ class LLMHandler:
517
  truncation=True,
518
  )
519
 
520
- # Use shared constrained processor if enabled
521
- constrained_processor = None
522
- if use_constrained_decoding:
523
- # Reset processor state for new generation
524
- self.constrained_processor.reset()
525
-
526
- # Use shared processor, just update caption and settings
527
- self.constrained_processor.enabled = use_constrained_decoding
528
- self.constrained_processor.debug = constrained_decoding_debug
529
- self.constrained_processor.set_target_duration(target_duration)
530
- # Always call set_user_metadata to ensure previous settings are cleared if None
531
- self.constrained_processor.set_user_metadata(user_metadata)
532
- self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
533
- # Set skip_caption and skip_language based on flags
534
- self.constrained_processor.set_skip_genres(skip_genres)
535
- self.constrained_processor.set_skip_caption(skip_caption)
536
- self.constrained_processor.set_skip_language(skip_language)
537
- # Set generation phase for phase-aware processing
538
- self.constrained_processor.set_generation_phase(generation_phase)
539
-
540
- constrained_processor = self.constrained_processor
541
 
542
  with self._load_model_context():
543
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
@@ -546,25 +552,18 @@ class LLMHandler:
546
  max_new_tokens = min(max_new_tokens, self.max_model_len - 64)
547
 
548
  # Build logits processor list (only for CFG and repetition penalty)
549
- logits_processor = LogitsProcessorList()
550
-
551
- # Add repetition penalty if needed
552
- if repetition_penalty != 1.0:
553
- logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
554
 
555
  if cfg_scale > 1.0:
556
  # Build unconditional prompt based on generation phase
557
- if generation_phase == "codes":
558
- # Codes phase: use empty CoT in unconditional prompt
559
- formatted_unconditional_prompt = self.build_formatted_prompt_with_cot(
560
- caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
561
- )
562
- else:
563
- # CoT phase: unconditional prompt
564
- # If negative_prompt is provided, use it as caption; otherwise remove caption and keep only lyrics
565
- formatted_unconditional_prompt = self.build_formatted_prompt(
566
- caption, lyrics, is_negative_prompt=True, generation_phase="cot", negative_prompt=negative_prompt
567
- )
568
 
569
  # Tokenize both prompts together to ensure same length (with left padding)
570
  # Left padding is important for generation tasks
@@ -657,7 +656,101 @@ class LLMHandler:
657
 
658
  output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
659
  return output_text
660
-
661
  def has_all_metas(self, user_metadata: Optional[Dict[str, Optional[str]]]) -> bool:
662
  """Check if all required metadata are present."""
663
  if user_metadata is None:
@@ -705,10 +798,13 @@ class LLMHandler:
705
  constrained_decoding_debug: bool = False,
706
  target_duration: Optional[float] = None,
707
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
 
708
  use_cot_caption: bool = True,
709
  use_cot_language: bool = True,
710
- is_format_caption: bool = False,
711
- ) -> Tuple[Dict[str, Any], str, str]:
 
 
712
  """Two-phase LM generation: CoT generation followed by audio codes generation.
713
 
714
  - infer_type='dit': Phase 1 only - generate CoT and return metas (no audio codes)
@@ -721,30 +817,67 @@ class LLMHandler:
721
  If specified, constrained decoding will inject these values directly.
722
  use_cot_caption: Whether to generate caption in CoT (default True).
723
  use_cot_language: Whether to generate language in CoT (default True).
724
- """
725
- import time
 
 
726
 
727
  infer_type = (infer_type or "").strip().lower()
728
  if infer_type not in {"dit", "llm_dit"}:
729
- return {}, "", f"invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
730
-
731
  metadata = {}
732
  audio_codes = ""
733
  has_all_metas = self.has_all_metas(user_metadata)
734
-
735
- # Timing variables
736
  phase1_time = 0.0
737
  phase2_time = 0.0
738
 
 
739
  # ========== PHASE 1: CoT Generation ==========
740
- # Always generate CoT unless all metadata are user-provided
741
- if not has_all_metas or not is_format_caption:
742
- logger.info("Phase 1: Generating CoT metadata...")
743
  phase1_start = time.time()
744
 
745
  # Build formatted prompt for CoT phase
746
  formatted_prompt = self.build_formatted_prompt(caption, lyrics, generation_phase="cot")
747
-
748
  logger.info(f"generate_with_stop_condition: formatted_prompt={formatted_prompt}")
749
  # Generate CoT (stop at </think>)
750
  cot_output_text, status = self.generate_from_formatted_prompt(
@@ -774,23 +907,63 @@ class LLMHandler:
774
  phase1_time = time.time() - phase1_start
775
 
776
  if not cot_output_text:
777
- return {}, "", status
 
 
779
  # Parse metadata from CoT output
780
  metadata, _ = self.parse_lm_output(cot_output_text)
781
- logger.info(f"Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
 
  else:
783
  # Use user-provided metadata
784
- logger.info("Phase 1: Using user-provided metadata (skipping generation)")
 
  metadata = {k: v for k, v in user_metadata.items() if v is not None}
786
 
787
  # If infer_type is 'dit', stop here and return only metadata
788
  if infer_type == "dit":
789
- status_msg = f"✅ Generated CoT metadata successfully\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
790
- return metadata, "", status_msg
 
791
 
792
  # ========== PHASE 2: Audio Codes Generation ==========
793
- logger.info("Phase 2: Generating audio codes...")
794
  phase2_start = time.time()
795
 
796
  # Format metadata as CoT using YAML (matching training format)
@@ -799,221 +972,163 @@ class LLMHandler:
799
  # Build formatted prompt with CoT for codes generation phase
800
  formatted_prompt_with_cot = self.build_formatted_prompt_with_cot(caption, lyrics, cot_text)
801
  logger.info(f"generate_with_stop_condition: formatted_prompt_with_cot={formatted_prompt_with_cot}")
802
- # Generate audio codes
803
- codes_output_text, status = self.generate_from_formatted_prompt(
804
- formatted_prompt=formatted_prompt_with_cot,
805
- cfg={
806
- "temperature": temperature,
807
- "cfg_scale": cfg_scale,
808
- "negative_prompt": negative_prompt,
809
- "top_k": top_k,
810
- "top_p": top_p,
811
- "repetition_penalty": repetition_penalty,
812
- "target_duration": target_duration,
813
- "user_metadata": None, # No user metadata injection in Phase 2
814
- "skip_caption": True, # Skip caption since CoT is already included
815
- "skip_language": True, # Skip language since CoT is already included
816
- "generation_phase": "codes",
817
- # Pass context for building unconditional prompt in codes phase
818
- "caption": caption,
819
- "lyrics": lyrics,
820
- "cot_text": cot_text,
821
- },
822
- use_constrained_decoding=use_constrained_decoding,
823
- constrained_decoding_debug=constrained_decoding_debug,
824
- stop_at_reasoning=False, # Generate codes until EOS
825
- )
826
-
827
- if not codes_output_text:
828
- return metadata, "", status
829
-
830
- phase2_time = time.time() - phase2_start
831
-
832
- # Parse audio codes from output (metadata should be same as Phase 1)
833
- _, audio_codes = self.parse_lm_output(codes_output_text)
834
-
835
- codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
836
- logger.info(f"Phase 2 completed in {phase2_time:.2f}s. Generated {codes_count} audio codes")
837
-
838
- status_msg = f"✅ Generated successfully (2-phase)\nPhase 1: CoT metadata\nPhase 2: {codes_count} audio codes\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
839
- return metadata, audio_codes, status_msg
840
-
841
- def generate_with_stop_condition_batch(
842
- self,
843
- caption: str,
844
- lyrics: str,
845
- batch_size: int,
846
- infer_type: str = "llm_dit",
847
- temperature: float = 0.85,
848
- cfg_scale: float = 1.0,
849
- negative_prompt: str = "NO USER INPUT",
850
- top_k: Optional[int] = None,
851
- top_p: Optional[float] = None,
852
- repetition_penalty: float = 1.0,
853
- use_constrained_decoding: bool = True,
854
- constrained_decoding_debug: bool = False,
855
- target_duration: Optional[float] = None,
856
- user_metadata: Optional[Dict[str, Optional[str]]] = None,
857
- use_cot_caption: bool = True,
858
- use_cot_language: bool = True,
859
- is_format_caption: bool = False,
860
- seeds: Optional[List[int]] = None,
861
- ) -> Tuple[List[Dict[str, Any]], List[str], str]:
862
- """
863
- Batch version of generate_with_stop_condition.
864
-
865
- Generates multiple audio codes with same conditions but different seeds (for diversity).
866
-
867
- Args:
868
- caption: Same caption for all items
869
- lyrics: Same lyrics for all items
870
- batch_size: Number of items to generate
871
- seeds: Optional list of seeds for each batch item (for reproducibility)
872
- ... (other args same as generate_with_stop_condition)
873
-
874
- Returns:
875
- Tuple of (metadata_list, audio_codes_list, status_message)
876
- - metadata_list: List of metadata dicts (same metadata for all items)
877
- - audio_codes_list: List of audio code strings (one per item, different due to sampling)
878
- - status_message: Generation status
879
- """
880
- import random
881
- import time
882
 
883
- infer_type = (infer_type or "").strip().lower()
884
- if infer_type not in {"dit", "llm_dit"}:
885
- return [], [], f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
886
-
887
- # Generate seeds if not provided
888
- if seeds is None:
889
- seeds = [random.randint(0, 2**32 - 1) for _ in range(batch_size)]
890
- elif len(seeds) < batch_size:
891
- # Pad with random seeds if not enough provided
892
- seeds = list(seeds) + [random.randint(0, 2**32 - 1) for _ in range(batch_size - len(seeds))]
893
- else:
894
- seeds = seeds[:batch_size] # Truncate if too many
895
-
896
- # Timing variables
897
- phase1_time = 0.0
898
- phase2_time = 0.0
899
-
900
- # ========== PHASE 1: CoT Generation (ONCE for all items) ==========
901
- has_all_metas = self.has_all_metas(user_metadata)
902
-
903
- if not has_all_metas or not is_format_caption:
904
- logger.info("Batch Phase 1: Generating CoT metadata (once for all items)...")
905
- phase1_start = time.time()
906
 
907
- # Generate CoT metadata once (same for all batch items)
908
- metadata, _, status = self.generate_with_stop_condition(
909
- caption=caption,
910
- lyrics=lyrics,
911
- infer_type="dit", # Only generate metadata
912
- temperature=temperature,
913
- cfg_scale=cfg_scale,
914
- negative_prompt=negative_prompt,
915
- top_k=top_k,
916
- top_p=top_p,
917
- repetition_penalty=repetition_penalty,
918
  use_constrained_decoding=use_constrained_decoding,
919
  constrained_decoding_debug=constrained_decoding_debug,
920
- target_duration=target_duration,
921
- user_metadata=user_metadata,
922
- use_cot_caption=use_cot_caption,
923
- use_cot_language=use_cot_language,
924
- is_format_caption=is_format_caption,
925
  )
926
 
927
- phase1_time = time.time() - phase1_start
928
 
929
- if not metadata:
930
- return [], [], status
931
 
932
- logger.info(f"Batch Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
933
- else:
934
- # Use user-provided metadata
935
- logger.info("Batch Phase 1: Using user-provided metadata (skipping generation)")
936
- metadata = {k: v for k, v in user_metadata.items() if v is not None}
937
-
938
- # If infer_type is 'dit', stop here and return only metadata
939
- if infer_type == "dit":
940
- metadata_list = [metadata.copy() for _ in range(batch_size)]
941
- status_msg = f"✅ Generated CoT metadata successfully (batch mode)\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
942
- return metadata_list, [""] * batch_size, status_msg
943
-
944
- # ========== PHASE 2: Audio Codes Generation (BATCH) ==========
945
- logger.info(f"Batch Phase 2: Generating audio codes for {batch_size} items...")
946
- phase2_start = time.time()
947
-
948
- # Format metadata as CoT
949
- cot_text = self._format_metadata_as_cot(metadata)
950
-
951
- # Build formatted prompt with CoT
952
- formatted_prompt = self.build_formatted_prompt_with_cot(caption, lyrics, cot_text)
953
-
954
- # Replicate prompt for batch (all items have same prompt, differ by seeds)
955
- formatted_prompts = [formatted_prompt] * batch_size
956
-
957
- # Call backend-specific batch generation
958
- try:
959
- if self.llm_backend == "vllm":
960
- codes_outputs = self._run_vllm_batch(
961
- formatted_prompts=formatted_prompts,
962
- temperature=temperature,
963
- cfg_scale=cfg_scale,
964
- negative_prompt=negative_prompt,
965
- top_k=top_k,
966
- top_p=top_p,
967
- repetition_penalty=repetition_penalty,
968
- use_constrained_decoding=use_constrained_decoding,
969
- constrained_decoding_debug=constrained_decoding_debug,
970
- target_duration=target_duration,
971
- generation_phase="codes",
972
- caption=caption,
973
- lyrics=lyrics,
974
- cot_text=cot_text,
975
- seeds=seeds,
976
- )
977
- else: # pt backend
978
- codes_outputs = self._run_pt_batch(
979
- formatted_prompts=formatted_prompts,
980
- temperature=temperature,
981
- cfg_scale=cfg_scale,
982
- negative_prompt=negative_prompt,
983
- top_k=top_k,
984
- top_p=top_p,
985
- repetition_penalty=repetition_penalty,
986
- use_constrained_decoding=use_constrained_decoding,
987
- constrained_decoding_debug=constrained_decoding_debug,
988
- target_duration=target_duration,
989
- generation_phase="codes",
990
- caption=caption,
991
- lyrics=lyrics,
992
- cot_text=cot_text,
993
- seeds=seeds,
994
- )
995
- except Exception as e:
996
- error_msg = f"❌ Error in batch codes generation: {str(e)}"
997
- logger.error(error_msg)
998
- return [], [], error_msg
999
-
1000
- # Parse audio codes from each output
1001
- audio_codes_list = []
1002
- metadata_list = []
1003
- for output_text in codes_outputs:
1004
- _, audio_codes = self.parse_lm_output(output_text)
1005
- audio_codes_list.append(audio_codes)
1006
- metadata_list.append(metadata.copy()) # Same metadata for all
1007
-
1008
- phase2_time = time.time() - phase2_start
1009
-
1010
- # Log results
1011
- codes_counts = [len(codes.split('<|audio_code_')) - 1 if codes else 0 for codes in audio_codes_list]
1012
- logger.info(f"Batch Phase 2 completed in {phase2_time:.2f}s. Generated codes: {codes_counts}")
1013
-
1014
- status_msg = f"✅ Batch generation completed ({batch_size} items)\nPhase 1: CoT metadata\nPhase 2: {sum(codes_counts)} total codes ({codes_counts})\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
1015
- return metadata_list, audio_codes_list, status_msg
1016
-
1017
  def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False, generation_phase: str = "cot", negative_prompt: str = "NO USER INPUT") -> str:
1018
  """
1019
  Build the chat-formatted prompt for 5Hz LM from caption/lyrics.
@@ -1035,7 +1150,7 @@ class LLMHandler:
1035
  if is_negative_prompt:
1036
  # Unconditional prompt for CFG
1037
  # Check if user provided a meaningful negative prompt (not the default)
1038
- has_negative_prompt = negative_prompt and negative_prompt.strip() and negative_prompt.strip() != "NO USER INPUT"
1039
 
1040
  if generation_phase == "cot":
1041
  # CoT phase unconditional prompt
@@ -1086,7 +1201,7 @@ class LLMHandler:
1086
  if is_negative_prompt:
1087
  # Unconditional prompt for codes phase
1088
  # Check if user provided a meaningful negative prompt
1089
- has_negative_prompt = negative_prompt and negative_prompt.strip() and negative_prompt.strip() != "NO USER INPUT"
1090
 
1091
  # Use empty CoT for unconditional
1092
  cot_for_prompt = "<think>\n</think>"
@@ -1369,8 +1484,8 @@ class LLMHandler:
1369
 
1370
  try:
1371
  if self.llm_backend == "vllm":
1372
- output_text = self._run_vllm_from_formatted(
1373
- formatted_prompt=formatted_prompt,
1374
  temperature=temperature,
1375
  cfg_scale=cfg_scale,
1376
  negative_prompt=negative_prompt,
@@ -1393,8 +1508,8 @@ class LLMHandler:
1393
  return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
1394
 
1395
  # PyTorch backend
1396
- output_text = self._run_pt_from_formatted(
1397
- formatted_prompt=formatted_prompt,
1398
  temperature=temperature,
1399
  cfg_scale=cfg_scale,
1400
  negative_prompt=negative_prompt,
@@ -1459,26 +1574,12 @@ class LLMHandler:
1459
  eos_token_id = pad_token_id
1460
 
1461
  # Build logits processor for repetition penalty
1462
- logits_processor = LogitsProcessorList()
1463
- if repetition_penalty != 1.0:
1464
- logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
1465
 
1466
  with torch.no_grad():
1467
  for step in range(max_new_tokens):
1468
  # Forward pass
1469
- if past_key_values is None:
1470
- outputs = model(
1471
- input_ids=generated_ids,
1472
- **model_kwargs,
1473
- use_cache=use_cache,
1474
- )
1475
- else:
1476
- outputs = model(
1477
- input_ids=generated_ids[:, -1:],
1478
- past_key_values=past_key_values,
1479
- **model_kwargs,
1480
- use_cache=use_cache,
1481
- )
1482
 
1483
  # Get logits for the last position
1484
  next_token_logits = outputs.logits[:, -1, :] # [batch_size, vocab_size]
@@ -1491,41 +1592,18 @@ class LLMHandler:
1491
  for processor in logits_processor:
1492
  next_token_logits = processor(generated_ids, next_token_logits)
1493
 
1494
- # Apply top-k filtering
1495
- if top_k is not None and top_k > 0:
1496
- indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
1497
- next_token_logits[indices_to_remove] = float('-inf')
1498
-
1499
- # Apply top-p filtering
1500
- if top_p is not None and 0.0 < top_p < 1.0:
1501
- sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
1502
- cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
1503
- sorted_indices_to_remove = cumulative_probs > top_p
1504
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
1505
- sorted_indices_to_remove[..., 0] = 0
1506
- indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
1507
- next_token_logits[indices_to_remove] = float('-inf')
1508
 
1509
  # Apply temperature and sample
1510
- if temperature > 0:
1511
- next_token_logits = next_token_logits / temperature
1512
- probs = torch.softmax(next_token_logits, dim=-1)
1513
- next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
1514
- else:
1515
- next_tokens = torch.argmax(next_token_logits, dim=-1)
1516
 
1517
  # Update constrained processor state
1518
- if constrained_processor is not None:
1519
- for b in range(next_tokens.shape[0]):
1520
- constrained_processor.update_state(next_tokens[b].item())
1521
 
1522
  # Check for EOS token
1523
- should_stop = False
1524
- if torch.any(next_tokens == eos_token_id):
1525
- should_stop = True
1526
- elif pad_token_id is not None and pad_token_id != eos_token_id:
1527
- if torch.any(next_tokens == pad_token_id):
1528
- should_stop = True
1529
 
1530
  # Append token to sequence
1531
  next_tokens_unsqueezed = next_tokens.unsqueeze(1)
@@ -1601,28 +1679,12 @@ class LLMHandler:
1601
  eos_token_id = pad_token_id
1602
 
1603
  # Build logits processor for non-CFG operations (repetition penalty, top_k, top_p)
1604
- logits_processor = LogitsProcessorList()
1605
- if repetition_penalty != 1.0:
1606
- logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
1607
 
1608
  with torch.no_grad():
1609
  for step in range(max_new_tokens):
1610
  # Forward pass for the entire batch (conditional + unconditional)
1611
- if past_key_values is None:
1612
- # First step: full forward pass
1613
- outputs = model(
1614
- input_ids=generated_ids,
1615
- **model_kwargs,
1616
- use_cache=use_cache,
1617
- )
1618
- else:
1619
- # Subsequent steps: only forward the last token (utilizing KV cache)
1620
- outputs = model(
1621
- input_ids=generated_ids[:, -1:],
1622
- past_key_values=past_key_values,
1623
- **model_kwargs,
1624
- use_cache=use_cache,
1625
- )
1626
 
1627
  # Get logits for the last position
1628
  next_token_logits = outputs.logits[:, -1, :] # [batch_size*2, vocab_size]
@@ -1645,45 +1707,20 @@ class LLMHandler:
1645
  for processor in logits_processor:
1646
  cfg_logits = processor(current_input_ids, cfg_logits)
1647
 
1648
- # Apply top-k filtering
1649
- if top_k is not None and top_k > 0:
1650
- indices_to_remove = cfg_logits < torch.topk(cfg_logits, top_k)[0][..., -1, None]
1651
- cfg_logits[indices_to_remove] = float('-inf')
1652
-
1653
- # Apply top-p (nucleus) filtering
1654
- if top_p is not None and 0.0 < top_p < 1.0:
1655
- sorted_logits, sorted_indices = torch.sort(cfg_logits, descending=True)
1656
- cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
1657
- # Remove tokens with cumulative probability above the threshold
1658
- sorted_indices_to_remove = cumulative_probs > top_p
1659
- # Shift the indices to the right to keep also the first token above the threshold
1660
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
1661
- sorted_indices_to_remove[..., 0] = 0
1662
- indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
1663
- cfg_logits[indices_to_remove] = float('-inf')
1664
 
1665
  # Apply temperature and sample
1666
- if temperature > 0:
1667
- cfg_logits = cfg_logits / temperature
1668
- probs = torch.softmax(cfg_logits, dim=-1)
1669
- next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
1670
- else:
1671
- next_tokens = torch.argmax(cfg_logits, dim=-1)
1672
 
1673
  # Update constrained processor state AFTER sampling
1674
- if constrained_processor is not None:
1675
- for b in range(next_tokens.shape[0]):
1676
- constrained_processor.update_state(next_tokens[b].item())
1677
 
1678
  # Check for EOS token in conditional sequences BEFORE unsqueezing
1679
  # Stop if any conditional sequence generates EOS token
1680
  # next_tokens shape: [batch_size] (only conditional tokens)
1681
- should_stop = False
1682
- if torch.any(next_tokens == eos_token_id):
1683
- should_stop = True
1684
- elif pad_token_id is not None and pad_token_id != eos_token_id:
1685
- if torch.any(next_tokens == pad_token_id):
1686
- should_stop = True
1687
 
1688
  # Apply the same sampled tokens to both conditional and unconditional sequences
1689
  next_tokens_unsqueezed = next_tokens.unsqueeze(1)
 
5
  import os
6
  import traceback
7
  import time
8
+ import random
9
+ from typing import Optional, Dict, Any, Tuple, List, Union
10
  from contextlib import contextmanager
11
 
12
  import yaml
 
86
  except Exception as e:
87
  return 0.9, False
88
 
89
+ def _has_meaningful_negative_prompt(self, negative_prompt: str) -> bool:
90
+ """Check if negative prompt is meaningful (not default/empty)"""
91
+ return bool(negative_prompt and negative_prompt.strip() and negative_prompt.strip() != "NO USER INPUT")
92
+
93
+ def _build_logits_processor(self, repetition_penalty: float) -> LogitsProcessorList:
94
+ """Build logits processor list with repetition penalty if needed"""
95
+ logits_processor = LogitsProcessorList()
96
+ if repetition_penalty != 1.0:
97
+ logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
98
+ return logits_processor
99
+
100
+ def _setup_constrained_processor(
101
+ self,
102
+ use_constrained_decoding: bool,
103
+ constrained_decoding_debug: bool,
104
+ target_duration: Optional[float],
105
+ user_metadata: Optional[Dict[str, Optional[str]]],
106
+ stop_at_reasoning: bool,
107
+ skip_genres: bool,
108
+ skip_caption: bool,
109
+ skip_language: bool,
110
+ generation_phase: str,
111
+ is_batch: bool = False,
112
+ metadata_temperature: Optional[float] = None,
113
+ codes_temperature: Optional[float] = None,
114
+ ) -> Optional[MetadataConstrainedLogitsProcessor]:
115
+ """Setup and configure constrained processor for generation"""
116
+ use_phase_temperatures = not is_batch and (metadata_temperature is not None or codes_temperature is not None)
117
+
118
+ if not use_constrained_decoding and not use_phase_temperatures:
119
+ return None
120
+
121
+ # Reset processor state for new generation
122
+ self.constrained_processor.reset()
123
+
124
+ # Use shared processor, just update settings
125
+ self.constrained_processor.enabled = use_constrained_decoding
126
+ self.constrained_processor.debug = constrained_decoding_debug
127
+
128
+ # Phase temperatures only supported in single mode
129
+ if use_phase_temperatures:
130
+ self.constrained_processor.metadata_temperature = metadata_temperature
131
+ self.constrained_processor.codes_temperature = codes_temperature
132
+ else:
133
+ self.constrained_processor.metadata_temperature = None
134
+ self.constrained_processor.codes_temperature = None
135
+
136
+ self.constrained_processor.set_target_duration(target_duration)
137
+
138
+ # Batch mode uses default/disabled settings for these options
139
+ if is_batch:
140
+ self.constrained_processor.set_user_metadata(None)
141
+ self.constrained_processor.set_stop_at_reasoning(False)
142
+ self.constrained_processor.set_skip_genres(True)
143
+ self.constrained_processor.set_skip_caption(True)
144
+ self.constrained_processor.set_skip_language(True)
145
+ else:
146
+ # Single mode uses provided settings
147
+ self.constrained_processor.set_user_metadata(user_metadata)
148
+ self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
149
+ self.constrained_processor.set_skip_genres(skip_genres)
150
+ self.constrained_processor.set_skip_caption(skip_caption)
151
+ self.constrained_processor.set_skip_language(skip_language)
152
+
153
+ # Set generation phase for phase-aware processing
154
+ self.constrained_processor.set_generation_phase(generation_phase)
155
+
156
+ return self.constrained_processor
157
+
158
+ def _build_unconditional_prompt(
159
+ self,
160
+ caption: str,
161
+ lyrics: str,
162
+ cot_text: str,
163
+ negative_prompt: str,
164
+ generation_phase: str,
165
+ is_batch: bool = False,
166
+ ) -> str:
167
+ """Build unconditional prompt for CFG based on generation phase and batch mode"""
168
+ if is_batch or generation_phase == "codes":
169
+ # Codes phase or batch mode: use empty CoT in unconditional prompt
170
+ return self.build_formatted_prompt_with_cot(
171
+ caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
172
+ )
173
+ else:
174
+ # CoT phase (single mode only): unconditional prompt
175
+ # If negative_prompt is provided, use it as caption; otherwise remove caption and keep only lyrics
176
+ return self.build_formatted_prompt(
177
+ caption, lyrics, is_negative_prompt=True, generation_phase="cot", negative_prompt=negative_prompt
178
+ )
179
+
180
+ def _load_pytorch_model(self, model_path: str, device: str) -> Tuple[bool, str]:
181
+ """Load PyTorch model from path and return (success, status_message)"""
182
+ try:
183
+ self.llm = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
184
+ if not self.offload_to_cpu:
185
+ self.llm = self.llm.to(device).to(self.dtype)
186
+ else:
187
+ self.llm = self.llm.to("cpu").to(self.dtype)
188
+ self.llm.eval()
189
+ self.llm_backend = "pt"
190
+ self.llm_initialized = True
191
+ logger.info(f"5Hz LM initialized successfully using PyTorch backend on {device}")
192
+ status_msg = f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nBackend: PyTorch\nDevice: {device}"
193
+ return True, status_msg
194
+ except Exception as e:
195
+ return False, f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
196
+
197
+ def _apply_top_k_filter(self, logits: torch.Tensor, top_k: Optional[int]) -> torch.Tensor:
198
+ """Apply top-k filtering to logits"""
199
+ if top_k is not None and top_k > 0:
200
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
201
+ logits[indices_to_remove] = float('-inf')
202
+ return logits
203
+
204
+ def _apply_top_p_filter(self, logits: torch.Tensor, top_p: Optional[float]) -> torch.Tensor:
205
+ """Apply top-p (nucleus) filtering to logits"""
206
+ if top_p is not None and 0.0 < top_p < 1.0:
207
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
208
+ cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
209
+ sorted_indices_to_remove = cumulative_probs > top_p
210
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
211
+ sorted_indices_to_remove[..., 0] = 0
212
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
213
+ logits[indices_to_remove] = float('-inf')
214
+ return logits
215
+
216
+ def _sample_tokens(self, logits: torch.Tensor, temperature: float) -> torch.Tensor:
217
+ """Sample tokens from logits with temperature"""
218
+ if temperature > 0:
219
+ logits = logits / temperature
220
+ probs = torch.softmax(logits, dim=-1)
221
+ return torch.multinomial(probs, num_samples=1).squeeze(1)
222
+ else:
223
+ return torch.argmax(logits, dim=-1)
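An illustrative, self-contained sketch (not part of this diff) of how the top-k, top-p, and temperature helpers above combine for one decoding step; the logits values and sampling settings are made up:

import torch

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])        # [batch=1, vocab=4], toy values
top_k, top_p, temperature = 2, 0.9, 0.8

# Top-k: keep the 2 largest logits, mask the rest to -inf
kth_value = torch.topk(logits, top_k)[0][..., -1, None]
logits = logits.masked_fill(logits < kth_value, float("-inf"))

# Top-p: drop the lowest-probability tail beyond cumulative 0.9
sorted_logits, sorted_idx = torch.sort(logits, descending=True)
cum_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
remove = cum_probs > top_p
remove[..., 1:] = remove[..., :-1].clone()
remove[..., 0] = False
logits = logits.masked_fill(remove.scatter(1, sorted_idx, remove), float("-inf"))

# Temperature sampling (greedy argmax would be used when temperature == 0)
probs = torch.softmax(logits / temperature, dim=-1)
next_token = torch.multinomial(probs, num_samples=1).squeeze(1)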
224
+
225
+ def _check_eos_token(self, tokens: torch.Tensor, eos_token_id: int, pad_token_id: Optional[int]) -> bool:
226
+ """Check if any token in the batch is EOS or pad token"""
227
+ if torch.any(tokens == eos_token_id):
228
+ return True
229
+ if pad_token_id is not None and pad_token_id != eos_token_id:
230
+ if torch.any(tokens == pad_token_id):
231
+ return True
232
+ return False
233
+
234
+ def _update_constrained_processor_state(self, constrained_processor: Optional[MetadataConstrainedLogitsProcessor], tokens: torch.Tensor):
235
+ """Update constrained processor state with generated tokens"""
236
+ if constrained_processor is not None:
237
+ for b in range(tokens.shape[0]):
238
+ constrained_processor.update_state(tokens[b].item())
239
+
240
+ def _forward_pass(
241
+ self,
242
+ model: Any,
243
+ generated_ids: torch.Tensor,
244
+ model_kwargs: Dict[str, Any],
245
+ past_key_values: Optional[Any],
246
+ use_cache: bool,
247
+ ) -> Any:
248
+ """Perform forward pass with KV cache support"""
249
+ if past_key_values is None:
250
+ outputs = model(
251
+ input_ids=generated_ids,
252
+ **model_kwargs,
253
+ use_cache=use_cache,
254
+ )
255
+ else:
256
+ outputs = model(
257
+ input_ids=generated_ids[:, -1:],
258
+ past_key_values=past_key_values,
259
+ **model_kwargs,
260
+ use_cache=use_cache,
261
+ )
262
+ return outputs
263
+
264
+ def _normalize_batch_input(self, formatted_prompts: Union[str, List[str]]) -> Tuple[List[str], bool]:
265
+ """Normalize batch input: convert single string to list and return (list, is_batch)"""
266
+ is_batch = isinstance(formatted_prompts, list)
267
+ if is_batch:
268
+ return formatted_prompts, is_batch
269
+ else:
270
+ return [formatted_prompts], is_batch
271
+
272
  def initialize(
273
  self,
274
  checkpoint_dir: str,
 
310
 
311
  logger.info("loading 5Hz LM tokenizer...")
312
  start_time = time.time()
313
+ # TODO: tokenizer loading is very slow; no solution found yet
314
  llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
315
  logger.info(f"5Hz LM tokenizer loaded successfully in {time.time() - start_time:.2f} seconds")
316
  self.llm_tokenizer = llm_tokenizer
 
335
  # vllm initialization failed, fallback to PyTorch
336
  if not self.llm_initialized:
337
  logger.warning("vllm initialization failed, falling back to PyTorch backend")
338
+ success, status_msg = self._load_pytorch_model(full_lm_model_path, device)
339
+ if not success:
340
+ return status_msg, False
341
+ status_msg = f"✅ 5Hz LM initialized successfully (PyTorch fallback)\nModel: {full_lm_model_path}\nBackend: PyTorch"
 
 
 
 
 
 
 
 
 
342
  # If vllm initialization succeeded, self.llm_initialized should already be True
343
  else:
344
  # Use PyTorch backend (pt)
345
+ success, status_msg = self._load_pytorch_model(full_lm_model_path, device)
346
+ if not success:
347
+ return status_msg, False
 
 
 
 
 
 
 
 
 
 
348
 
349
  return status_msg, True
350
 
351
  except Exception as e:
352
+ return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
 
353
 
354
  def _initialize_5hz_lm_vllm(self, model_path: str) -> str:
355
  """Initialize 5Hz LM model using vllm backend"""
 
395
  return f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nDevice: {device_name}\nGPU Memory Utilization: {gpu_memory_utilization:.2f}"
396
  except Exception as e:
397
  self.llm_initialized = False
398
+ return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
 
399
 
400
+ def _run_vllm(
401
  self,
402
+ formatted_prompts: Union[str, List[str]],
403
  temperature: float,
404
  cfg_scale: float,
405
  negative_prompt: str,
 
408
  repetition_penalty: float,
409
  use_constrained_decoding: bool = True,
410
  constrained_decoding_debug: bool = False,
411
+ metadata_temperature: Optional[float] = None,
412
  codes_temperature: Optional[float] = None,
413
  target_duration: Optional[float] = None,
414
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
 
420
  caption: str = "",
421
  lyrics: str = "",
422
  cot_text: str = "",
423
+ seeds: Optional[List[int]] = None,
424
+ ) -> Union[str, List[str]]:
425
+ """
426
+ Unified vllm generation function supporting both single and batch modes.
427
+ Accepts either a single formatted prompt (str) or a list of formatted prompts (List[str]).
428
+ Returns a single string for single mode, or a list of strings for batch mode.
429
+ """
430
  from nanovllm import SamplingParams
431
 
432
+ # Determine if batch mode
433
+ formatted_prompt_list, is_batch = self._normalize_batch_input(formatted_prompts)
434
+ batch_size = len(formatted_prompt_list)
435
+
436
  # Determine effective temperature for sampler
437
+ # Batch mode doesn't support phase temperatures, so use simple temperature
438
+ # Single mode supports phase temperatures
439
+ use_phase_temperatures = not is_batch and (metadata_temperature is not None or codes_temperature is not None)
440
  effective_sampler_temp = 1.0 if use_phase_temperatures else temperature
441
 
442
+ # Setup constrained processor
443
+ constrained_processor = self._setup_constrained_processor(
444
+ use_constrained_decoding=use_constrained_decoding or use_phase_temperatures,
445
+ constrained_decoding_debug=constrained_decoding_debug,
446
+ target_duration=target_duration,
447
+ user_metadata=user_metadata,
448
+ stop_at_reasoning=stop_at_reasoning,
449
+ skip_genres=skip_genres,
450
+ skip_caption=skip_caption,
451
+ skip_language=skip_language,
452
+ generation_phase=generation_phase,
453
+ is_batch=is_batch,
454
+ metadata_temperature=metadata_temperature,
455
+ codes_temperature=codes_temperature,
456
+ )
 
 
 
 
 
 
 
 
457
 
458
  sampling_params = SamplingParams(
459
  max_tokens=self.max_model_len - 64,
 
468
 
469
  if cfg_scale > 1.0:
470
  # Build unconditional prompt based on generation phase
471
+ formatted_unconditional_prompt = self._build_unconditional_prompt(
472
+ caption=caption,
473
+ lyrics=lyrics,
474
+ cot_text=cot_text,
475
+ negative_prompt=negative_prompt,
476
+ generation_phase=generation_phase,
477
+ is_batch=is_batch,
 
 
478
  )
479
  unconditional_prompts = [formatted_unconditional_prompt] * batch_size
480
 
481
  outputs = self.llm.generate(
482
+ formatted_prompt_list,
483
  sampling_params,
484
  unconditional_prompts=unconditional_prompts,
485
  )
486
  else:
487
+ outputs = self.llm.generate(formatted_prompt_list, sampling_params)
488
+
489
+ # Extract text from outputs
490
  output_texts = []
491
  for output in outputs:
492
  if hasattr(output, "outputs") and len(output.outputs) > 0:
 
497
  output_texts.append(output["text"])
498
  else:
499
  output_texts.append(str(output))
 
 
500
 
501
+ # Return single string for single mode, list for batch mode
502
+ return output_texts[0] if not is_batch else output_texts
 
 
503
 
504
+ def _run_pt_single(
505
  self,
506
  formatted_prompt: str,
507
  temperature: float,
 
510
  top_k: Optional[int],
511
  top_p: Optional[float],
512
  repetition_penalty: float,
513
+ use_constrained_decoding: bool,
514
+ constrained_decoding_debug: bool,
515
+ target_duration: Optional[float],
516
+ user_metadata: Optional[Dict[str, Optional[str]]],
517
+ stop_at_reasoning: bool,
518
+ skip_genres: bool,
519
+ skip_caption: bool,
520
+ skip_language: bool,
521
+ generation_phase: str,
522
+ caption: str,
523
+ lyrics: str,
524
+ cot_text: str,
525
  ) -> str:
526
+ """Internal helper function for single-item PyTorch generation."""
527
  inputs = self.llm_tokenizer(
528
  formatted_prompt,
529
  return_tensors="pt",
 
531
  truncation=True,
532
  )
533
 
534
+ # Setup constrained processor
535
+ constrained_processor = self._setup_constrained_processor(
536
+ use_constrained_decoding=use_constrained_decoding,
537
+ constrained_decoding_debug=constrained_decoding_debug,
538
+ target_duration=target_duration,
539
+ user_metadata=user_metadata,
540
+ stop_at_reasoning=stop_at_reasoning,
541
+ skip_genres=skip_genres,
542
+ skip_caption=skip_caption,
543
+ skip_language=skip_language,
544
+ generation_phase=generation_phase,
545
+ is_batch=False,
546
+ )
 
 
 
 
 
 
 
 
547
 
548
  with self._load_model_context():
549
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
552
  max_new_tokens = min(max_new_tokens, self.max_model_len - 64)
553
 
554
  # Build logits processor list (only for CFG and repetition penalty)
555
+ logits_processor = self._build_logits_processor(repetition_penalty)
 
 
 
 
556
 
557
  if cfg_scale > 1.0:
558
  # Build unconditional prompt based on generation phase
559
+ formatted_unconditional_prompt = self._build_unconditional_prompt(
560
+ caption=caption,
561
+ lyrics=lyrics,
562
+ cot_text=cot_text,
563
+ negative_prompt=negative_prompt,
564
+ generation_phase=generation_phase,
565
+ is_batch=False,
566
+ )
 
 
 
567
 
568
  # Tokenize both prompts together to ensure same length (with left padding)
569
  # Left padding is important for generation tasks
 
656
 
657
  output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
658
  return output_text
659
+
660
+ def _run_pt(
661
+ self,
662
+ formatted_prompts: Union[str, List[str]],
663
+ temperature: float,
664
+ cfg_scale: float,
665
+ negative_prompt: str,
666
+ top_k: Optional[int],
667
+ top_p: Optional[float],
668
+ repetition_penalty: float,
669
+ use_constrained_decoding: bool = True,
670
+ constrained_decoding_debug: bool = False,
671
+ target_duration: Optional[float] = None,
672
+ user_metadata: Optional[Dict[str, Optional[str]]] = None,
673
+ stop_at_reasoning: bool = False,
674
+ skip_genres: bool = True,
675
+ skip_caption: bool = False,
676
+ skip_language: bool = False,
677
+ generation_phase: str = "cot",
678
+ caption: str = "",
679
+ lyrics: str = "",
680
+ cot_text: str = "",
681
+ seeds: Optional[List[int]] = None,
682
+ ) -> Union[str, List[str]]:
683
+ """
684
+ Unified PyTorch generation function supporting both single and batch modes.
685
+ Accepts either a single formatted prompt (str) or a list of formatted prompts (List[str]).
686
+ Returns a single string for single mode, or a list of strings for batch mode.
687
+ Note: the PyTorch backend processes batch items sequentially (it does not support efficient true batching).
688
+ """
689
+ # Determine if batch mode
690
+ formatted_prompt_list, is_batch = self._normalize_batch_input(formatted_prompts)
691
+
692
+ # For batch mode, process each item sequentially with different seeds
693
+ if is_batch:
694
+ output_texts = []
695
+ for i, formatted_prompt in enumerate(formatted_prompt_list):
696
+ # Set seed for this item if provided
697
+ if seeds and i < len(seeds):
698
+ torch.manual_seed(seeds[i])
699
+ if torch.cuda.is_available():
700
+ torch.cuda.manual_seed_all(seeds[i])
701
+
702
+ # Generate using single-item method with batch-mode defaults
703
+ output_text = self._run_pt_single(
704
+ formatted_prompt=formatted_prompt,
705
+ temperature=temperature,
706
+ cfg_scale=cfg_scale,
707
+ negative_prompt=negative_prompt,
708
+ top_k=top_k,
709
+ top_p=top_p,
710
+ repetition_penalty=repetition_penalty,
711
+ use_constrained_decoding=use_constrained_decoding,
712
+ constrained_decoding_debug=constrained_decoding_debug,
713
+ target_duration=target_duration,
714
+ user_metadata=None,
715
+ stop_at_reasoning=False,
716
+ skip_genres=True,
717
+ skip_caption=True,
718
+ skip_language=True,
719
+ generation_phase=generation_phase,
720
+ caption=caption,
721
+ lyrics=lyrics,
722
+ cot_text=cot_text,
723
+ )
724
+
725
+ output_texts.append(output_text)
726
+
727
+ return output_texts
728
+
729
+ # Single mode: process the formatted prompt
730
+ formatted_prompt = formatted_prompt_list[0]
731
+
732
+ return self._run_pt_single(
733
+ formatted_prompt=formatted_prompt,
734
+ temperature=temperature,
735
+ cfg_scale=cfg_scale,
736
+ negative_prompt=negative_prompt,
737
+ top_k=top_k,
738
+ top_p=top_p,
739
+ repetition_penalty=repetition_penalty,
740
+ use_constrained_decoding=use_constrained_decoding,
741
+ constrained_decoding_debug=constrained_decoding_debug,
742
+ target_duration=target_duration,
743
+ user_metadata=user_metadata,
744
+ stop_at_reasoning=stop_at_reasoning,
745
+ skip_genres=skip_genres,
746
+ skip_caption=skip_caption,
747
+ skip_language=skip_language,
748
+ generation_phase=generation_phase,
749
+ caption=caption,
750
+ lyrics=lyrics,
751
+ cot_text=cot_text,
752
+ )
753
+
754
  def has_all_metas(self, user_metadata: Optional[Dict[str, Optional[str]]]) -> bool:
755
  """Check if all required metadata are present."""
756
  if user_metadata is None:
 
798
  constrained_decoding_debug: bool = False,
799
  target_duration: Optional[float] = None,
800
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
801
+ use_cot_metas: bool = True,
802
  use_cot_caption: bool = True,
803
  use_cot_language: bool = True,
804
+ batch_size: Optional[int] = None,
805
+ seeds: Optional[List[int]] = None,
806
+ progress=None,
807
+ ) -> Dict[str, Any]:
808
  """Two-phase LM generation: CoT generation followed by audio codes generation.
809
 
810
  - infer_type='dit': Phase 1 only - generate CoT and return metas (no audio codes)
 
817
  If specified, constrained decoding will inject these values directly.
818
  use_cot_caption: Whether to generate caption in CoT (default True).
819
  use_cot_language: Whether to generate language in CoT (default True).
820
+ batch_size: Optional batch size for batch generation. If None or 1, returns single result.
821
+ If > 1, returns batch results (lists).
822
+ seeds: Optional list of seeds for batch generation (for reproducibility).
823
+ Only used when batch_size > 1. TODO: currently unused.
824
 
825
+ Returns:
826
+ Dictionary containing:
827
+ - metadata: Dict or List[Dict] - Generated metadata
828
+ - audio_codes: str or List[str] - Generated audio codes
829
+ - success: bool - Whether generation succeeded
830
+ - error: Optional[str] - Error message if failed
831
+ - extra_outputs: Dict with time_costs and other info
832
+ """
833
+ if progress is None:
834
+ def progress(*args, **kwargs):
835
+ pass
836
+
837
  infer_type = (infer_type or "").strip().lower()
838
  if infer_type not in {"dit", "llm_dit"}:
839
+ error_msg = f"invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
840
+ return {
841
+ "metadata": [] if (batch_size and batch_size > 1) else {},
842
+ "audio_codes": [] if (batch_size and batch_size > 1) else "",
843
+ "success": False,
844
+ "error": error_msg,
845
+ "extra_outputs": {"time_costs": {}},
846
+ }
847
+
848
+ # Determine if batch mode
849
+ is_batch = bool(batch_size and batch_size > 1)
850
+ actual_batch_size = batch_size if is_batch else 1
851
+
852
+ # Initialize variables
853
  metadata = {}
854
  audio_codes = ""
855
  has_all_metas = self.has_all_metas(user_metadata)
 
 
856
  phase1_time = 0.0
857
  phase2_time = 0.0
858
 
859
+ # Handle seeds for batch mode
860
+ if is_batch:
861
+ if seeds is None:
862
+ seeds = [random.randint(0, 2**32 - 1) for _ in range(actual_batch_size)]
863
+ elif len(seeds) < actual_batch_size:
864
+ seeds = list(seeds) + [random.randint(0, 2**32 - 1) for _ in range(actual_batch_size - len(seeds))]
865
+ else:
866
+ seeds = seeds[:actual_batch_size]
867
+
868
  # ========== PHASE 1: CoT Generation ==========
869
+ # Skip CoT if all metadata are user-provided OR caption is already formatted
870
+ progress(0.1, "Phase 1: Generating CoT metadata...")
871
+ if not has_all_metas and use_cot_metas:
872
+ if is_batch:
873
+ logger.info("Batch Phase 1: Generating CoT metadata (once for all items)...")
874
+ else:
875
+ logger.info("Phase 1: Generating CoT metadata...")
876
  phase1_start = time.time()
877
 
878
  # Build formatted prompt for CoT phase
879
  formatted_prompt = self.build_formatted_prompt(caption, lyrics, generation_phase="cot")
880
+
881
  logger.info(f"generate_with_stop_condition: formatted_prompt={formatted_prompt}")
882
  # Generate CoT (stop at </think>)
883
  cot_output_text, status = self.generate_from_formatted_prompt(
 
907
  phase1_time = time.time() - phase1_start
908
 
909
  if not cot_output_text:
910
+ return {
911
+ "metadata": [] if is_batch else {},
912
+ "audio_codes": [] if is_batch else "",
913
+ "success": False,
914
+ "error": status,
915
+ "extra_outputs": {"time_costs": {"phase1_time": phase1_time}},
916
+ }
917
 
918
  # Parse metadata from CoT output
919
  metadata, _ = self.parse_lm_output(cot_output_text)
920
+ if is_batch:
921
+ logger.info(f"Batch Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
922
+ else:
923
+ logger.info(f"Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
924
  else:
925
  # Use user-provided metadata
926
+ if is_batch:
927
+ logger.info("Batch Phase 1: Using user-provided metadata (skipping generation)")
928
+ else:
929
+ logger.info("Phase 1: Using user-provided metadata (skipping generation)")
930
  metadata = {k: v for k, v in user_metadata.items() if v is not None}
931
 
932
  # If infer_type is 'dit', stop here and return only metadata
933
  if infer_type == "dit":
934
+ if is_batch:
935
+ metadata_list = [metadata.copy() for _ in range(actual_batch_size)]
936
+ return {
937
+ "metadata": metadata_list,
938
+ "audio_codes": [""] * actual_batch_size,
939
+ "success": True,
940
+ "error": None,
941
+ "extra_outputs": {
942
+ "time_costs": {
943
+ "phase1_time": phase1_time,
944
+ "total_time": phase1_time,
945
+ }
946
+ },
947
+ }
948
+ else:
949
+ return {
950
+ "metadata": metadata,
951
+ "audio_codes": "",
952
+ "success": True,
953
+ "error": None,
954
+ "extra_outputs": {
955
+ "time_costs": {
956
+ "phase1_time": phase1_time,
957
+ "total_time": phase1_time,
958
+ }
959
+ },
960
+ }
961
 
962
  # ========== PHASE 2: Audio Codes Generation ==========
963
+ if is_batch:
964
+ logger.info(f"Batch Phase 2: Generating audio codes for {actual_batch_size} items...")
965
+ else:
966
+ logger.info("Phase 2: Generating audio codes...")
967
  phase2_start = time.time()
968
 
969
  # Format metadata as CoT using YAML (matching training format)
 
972
  # Build formatted prompt with CoT for codes generation phase
973
  formatted_prompt_with_cot = self.build_formatted_prompt_with_cot(caption, lyrics, cot_text)
974
  logger.info(f"generate_with_stop_condition: formatted_prompt_with_cot={formatted_prompt_with_cot}")
 
 
 
975
 
976
+ progress(0.5, f"Phase 2: Generating audio codes for {actual_batch_size} items...")
977
+ if is_batch:
978
+ # Batch mode: generate codes for all items
979
+ formatted_prompts = [formatted_prompt_with_cot] * actual_batch_size
 
 
980
 
981
+ # Call backend-specific batch generation
982
+ try:
983
+ if self.llm_backend == "vllm":
984
+ codes_outputs = self._run_vllm(
985
+ formatted_prompts=formatted_prompts,
986
+ temperature=temperature,
987
+ cfg_scale=cfg_scale,
988
+ negative_prompt=negative_prompt,
989
+ top_k=top_k,
990
+ top_p=top_p,
991
+ repetition_penalty=repetition_penalty,
992
+ use_constrained_decoding=use_constrained_decoding,
993
+ constrained_decoding_debug=constrained_decoding_debug,
994
+ target_duration=target_duration,
995
+ generation_phase="codes",
996
+ caption=caption,
997
+ lyrics=lyrics,
998
+ cot_text=cot_text,
999
+ seeds=seeds,
1000
+ )
1001
+ else: # pt backend
1002
+ codes_outputs = self._run_pt(
1003
+ formatted_prompts=formatted_prompts,
1004
+ temperature=temperature,
1005
+ cfg_scale=cfg_scale,
1006
+ negative_prompt=negative_prompt,
1007
+ top_k=top_k,
1008
+ top_p=top_p,
1009
+ repetition_penalty=repetition_penalty,
1010
+ use_constrained_decoding=use_constrained_decoding,
1011
+ constrained_decoding_debug=constrained_decoding_debug,
1012
+ target_duration=target_duration,
1013
+ generation_phase="codes",
1014
+ caption=caption,
1015
+ lyrics=lyrics,
1016
+ cot_text=cot_text,
1017
+ seeds=seeds,
1018
+ )
1019
+ except Exception as e:
1020
+ error_msg = f"Error in batch codes generation: {str(e)}"
1021
+ logger.error(error_msg)
1022
+ return {
1023
+ "metadata": [],
1024
+ "audio_codes": [],
1025
+ "success": False,
1026
+ "error": error_msg,
1027
+ "extra_outputs": {
1028
+ "time_costs": {
1029
+ "phase1_time": phase1_time,
1030
+ "phase2_time": 0.0,
1031
+ "total_time": phase1_time,
1032
+ }
1033
+ },
1034
+ }
1035
+
1036
+ # Parse audio codes from each output
1037
+ audio_codes_list = []
1038
+ metadata_list = []
1039
+ for output_text in codes_outputs:
1040
+ _, audio_codes_item = self.parse_lm_output(output_text)
1041
+ audio_codes_list.append(audio_codes_item)
1042
+ metadata_list.append(metadata.copy()) # Same metadata for all
1043
+
1044
+ phase2_time = time.time() - phase2_start
1045
+
1046
+ # Log results
1047
+ codes_counts = [len(codes.split('<|audio_code_')) - 1 if codes else 0 for codes in audio_codes_list]
1048
+ logger.info(f"Batch Phase 2 completed in {phase2_time:.2f}s. Generated codes: {codes_counts}")
1049
+
1050
+ total_time = phase1_time + phase2_time
1051
+ return {
1052
+ "metadata": metadata_list,
1053
+ "audio_codes": audio_codes_list,
1054
+ "success": True,
1055
+ "error": None,
1056
+ "extra_outputs": {
1057
+ "time_costs": {
1058
+ "phase1_time": phase1_time,
1059
+ "phase2_time": phase2_time,
1060
+ "total_time": total_time,
1061
+ },
1062
+ "codes_counts": codes_counts,
1063
+ "total_codes": sum(codes_counts),
1064
+ },
1065
+ }
1066
+ else:
1067
+ # Single mode: generate codes for one item
1068
+ codes_output_text, status = self.generate_from_formatted_prompt(
1069
+ formatted_prompt=formatted_prompt_with_cot,
1070
+ cfg={
1071
+ "temperature": temperature,
1072
+ "cfg_scale": cfg_scale,
1073
+ "negative_prompt": negative_prompt,
1074
+ "top_k": top_k,
1075
+ "top_p": top_p,
1076
+ "repetition_penalty": repetition_penalty,
1077
+ "target_duration": target_duration,
1078
+ "user_metadata": None, # No user metadata injection in Phase 2
1079
+ "skip_caption": True, # Skip caption since CoT is already included
1080
+ "skip_language": True, # Skip language since CoT is already included
1081
+ "generation_phase": "codes",
1082
+ # Pass context for building unconditional prompt in codes phase
1083
+ "caption": caption,
1084
+ "lyrics": lyrics,
1085
+ "cot_text": cot_text,
1086
+ },
1087
  use_constrained_decoding=use_constrained_decoding,
1088
  constrained_decoding_debug=constrained_decoding_debug,
1089
+ stop_at_reasoning=False, # Generate codes until EOS
 
 
 
 
1090
  )
1091
 
1092
+ if not codes_output_text:
1093
+ total_time = phase1_time + phase2_time
1094
+ return {
1095
+ "metadata": metadata,
1096
+ "audio_codes": "",
1097
+ "success": False,
1098
+ "error": status,
1099
+ "extra_outputs": {
1100
+ "time_costs": {
1101
+ "phase1_time": phase1_time,
1102
+ "phase2_time": phase2_time,
1103
+ "total_time": total_time,
1104
+ }
1105
+ },
1106
+ }
1107
 
1108
+ phase2_time = time.time() - phase2_start
 
1109
 
1110
+ # Parse audio codes from output (metadata should be same as Phase 1)
1111
+ _, audio_codes = self.parse_lm_output(codes_output_text)
1112
+
1113
+ codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
1114
+ logger.info(f"Phase 2 completed in {phase2_time:.2f}s. Generated {codes_count} audio codes")
1115
+
1116
+ total_time = phase1_time + phase2_time
1117
+ return {
1118
+ "metadata": metadata,
1119
+ "audio_codes": audio_codes,
1120
+ "success": True,
1121
+ "error": None,
1122
+ "extra_outputs": {
1123
+ "time_costs": {
1124
+ "phase1_time": phase1_time,
1125
+ "phase2_time": phase2_time,
1126
+ "total_time": total_time,
1127
+ },
1128
+ "codes_count": codes_count,
1129
+ },
1130
+ }
1131
+
 
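An illustrative consumer of the result dictionary returned above (a hedged sketch; the function name is hypothetical and not part of this diff):

def consume_two_phase_result(result: dict):
    """Unpack the {'metadata', 'audio_codes', 'success', 'error', 'extra_outputs'} contract above."""
    if not result["success"]:
        raise RuntimeError(result["error"])
    timings = result["extra_outputs"]["time_costs"]  # phase1_time / phase2_time / total_time
    # metadata is a dict (single mode) or a list of dicts (batch mode); audio_codes mirrors that.
    return result["metadata"], result["audio_codes"], timings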
 
 
1132
  def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False, generation_phase: str = "cot", negative_prompt: str = "NO USER INPUT") -> str:
1133
  """
1134
  Build the chat-formatted prompt for 5Hz LM from caption/lyrics.
 
1150
  if is_negative_prompt:
1151
  # Unconditional prompt for CFG
1152
  # Check if user provided a meaningful negative prompt (not the default)
1153
+ has_negative_prompt = self._has_meaningful_negative_prompt(negative_prompt)
1154
 
1155
  if generation_phase == "cot":
1156
  # CoT phase unconditional prompt
 
1201
  if is_negative_prompt:
1202
  # Unconditional prompt for codes phase
1203
  # Check if user provided a meaningful negative prompt
1204
+ has_negative_prompt = self._has_meaningful_negative_prompt(negative_prompt)
1205
 
1206
  # Use empty CoT for unconditional
1207
  cot_for_prompt = "<think>\n</think>"
 
1484
 
1485
  try:
1486
  if self.llm_backend == "vllm":
1487
+ output_text = self._run_vllm(
1488
+ formatted_prompts=formatted_prompt,
1489
  temperature=temperature,
1490
  cfg_scale=cfg_scale,
1491
  negative_prompt=negative_prompt,
 
1508
  return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
1509
 
1510
  # PyTorch backend
1511
+ output_text = self._run_pt(
1512
+ formatted_prompts=formatted_prompt,
1513
  temperature=temperature,
1514
  cfg_scale=cfg_scale,
1515
  negative_prompt=negative_prompt,
 
1574
  eos_token_id = pad_token_id
1575
 
1576
  # Build logits processor for repetition penalty
1577
+ logits_processor = self._build_logits_processor(repetition_penalty)
 
 
1578
 
1579
  with torch.no_grad():
1580
  for step in range(max_new_tokens):
1581
  # Forward pass
1582
+ outputs = self._forward_pass(model, generated_ids, model_kwargs, past_key_values, use_cache)
 
 
 
 
 
 
 
 
 
 
 
 
1583
 
1584
  # Get logits for the last position
1585
  next_token_logits = outputs.logits[:, -1, :] # [batch_size, vocab_size]
 
1592
  for processor in logits_processor:
1593
  next_token_logits = processor(generated_ids, next_token_logits)
1594
 
1595
+ # Apply top-k and top-p filtering
1596
+ next_token_logits = self._apply_top_k_filter(next_token_logits, top_k)
1597
+ next_token_logits = self._apply_top_p_filter(next_token_logits, top_p)
 
 
 
 
 
 
 
 
 
 
 
1598
 
1599
  # Apply temperature and sample
1600
+ next_tokens = self._sample_tokens(next_token_logits, temperature)
 
 
 
 
 
1601
 
1602
  # Update constrained processor state
1603
+ self._update_constrained_processor_state(constrained_processor, next_tokens)
 
 
1604
 
1605
  # Check for EOS token
1606
+ should_stop = self._check_eos_token(next_tokens, eos_token_id, pad_token_id)
 
 
 
 
 
1607
 
1608
  # Append token to sequence
1609
  next_tokens_unsqueezed = next_tokens.unsqueeze(1)
 
1679
  eos_token_id = pad_token_id
1680
 
1681
  # Build logits processor for non-CFG operations (repetition penalty, top_k, top_p)
1682
+ logits_processor = self._build_logits_processor(repetition_penalty)
 
 
1683
 
1684
  with torch.no_grad():
1685
  for step in range(max_new_tokens):
1686
  # Forward pass for the entire batch (conditional + unconditional)
1687
+ outputs = self._forward_pass(model, generated_ids, model_kwargs, past_key_values, use_cache)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1688
 
1689
  # Get logits for the last position
1690
  next_token_logits = outputs.logits[:, -1, :] # [batch_size*2, vocab_size]
 
1707
  for processor in logits_processor:
1708
  cfg_logits = processor(current_input_ids, cfg_logits)
1709
 
1710
+ # Apply top-k and top-p filtering
1711
+ cfg_logits = self._apply_top_k_filter(cfg_logits, top_k)
1712
+ cfg_logits = self._apply_top_p_filter(cfg_logits, top_p)
 
 
 
 
 
 
 
 
 
 
 
 
 
1713
 
1714
  # Apply temperature and sample
1715
+ next_tokens = self._sample_tokens(cfg_logits, temperature)
 
 
 
 
 
1716
 
1717
  # Update constrained processor state AFTER sampling
1718
+ self._update_constrained_processor_state(constrained_processor, next_tokens)
 
 
1719
 
1720
  # Check for EOS token in conditional sequences BEFORE unsqueezing
1721
  # Stop if any conditional sequence generates EOS token
1722
  # next_tokens shape: [batch_size] (only conditional tokens)
1723
+ should_stop = self._check_eos_token(next_tokens, eos_token_id, pad_token_id)
 
 
 
 
 
1724
 
1725
  # Apply the same sampled tokens to both conditional and unconditional sequences
1726
  next_tokens_unsqueezed = next_tokens.unsqueeze(1)
acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py CHANGED
@@ -68,10 +68,16 @@ class ModelRunner:
68
  self.model = Qwen3ForCausalLM(hf_config)
69
  load_model(self.model, config.model)
70
  self.sampler = Sampler()
 
 
 
 
 
71
  self.warmup_model()
72
  self.allocate_kv_cache()
73
  if not self.enforce_eager:
74
  self.capture_cudagraph()
 
75
  torch.set_default_device("cpu")
76
  torch.set_default_dtype(default_dtype)
77
 
@@ -84,6 +90,39 @@ class ModelRunner:
84
  self.shm = SharedMemory(name="nanovllm")
85
  self.loop()
86
 
 
 
87
  def exit(self):
88
  if self.world_size > 1:
89
  self.shm.close()
@@ -203,7 +242,7 @@ class ModelRunner:
203
  if i != seq.num_blocks - 1:
204
  end = start + self.block_size
205
  else:
206
- end = start + seq.last_block_num_tokens
207
  slot_mapping.extend(list(range(start, end)))
208
  if cu_seqlens_k[-1] > cu_seqlens_q[-1]: # prefix cache
209
  block_tables = self.prepare_block_tables(seqs)
@@ -216,57 +255,58 @@ class ModelRunner:
216
  return input_ids, positions
217
 
218
  def prepare_decode(self, seqs: list[Sequence]):
219
- input_ids = []
220
- positions = []
221
- slot_mapping = []
222
- context_lens = []
223
- for seq in seqs:
224
- input_ids.append(seq.last_token)
225
- positions.append(len(seq) - 1)
226
- context_lens.append(len(seq))
227
- slot_mapping.append(seq.block_table[-1] * self.block_size + seq.last_block_num_tokens - 1)
228
- input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
229
- positions = torch.tensor(positions, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
230
- slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
231
- context_lens = torch.tensor(context_lens, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
 
 
232
  block_tables = self.prepare_block_tables(seqs)
233
  set_context(False, slot_mapping=slot_mapping, context_lens=context_lens, block_tables=block_tables)
234
  return input_ids, positions
235
 
236
  def prepare_sample(self, seqs: list[Sequence], is_cfg_batch: bool = False):
237
- """Prepare sampling parameters. For CFG batch, only return parameters for conditional sequences."""
238
  if is_cfg_batch:
239
- # For CFG batch, seqs contains [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
240
- # We only need parameters for conditional sequences (first half)
241
- num_cond = len(seqs) // 2
242
- temperatures = []
243
- cfg_scales = []
244
- top_ks = []
245
- top_ps = []
246
- repetition_penalties = []
247
- for seq in seqs[:num_cond]:
248
- temperatures.append(seq.temperature)
249
- cfg_scales.append(seq.cfg_scale)
250
- top_ks.append(seq.top_k if seq.top_k is not None else 0)
251
- top_ps.append(seq.top_p if seq.top_p is not None else 1.0)
252
- repetition_penalties.append(seq.repetition_penalty)
253
  else:
254
- temperatures = []
255
- cfg_scales = []
256
- top_ks = []
257
- top_ps = []
258
- repetition_penalties = []
259
- for seq in seqs:
260
- temperatures.append(seq.temperature)
261
- cfg_scales.append(seq.cfg_scale)
262
- top_ks.append(seq.top_k if seq.top_k is not None else 0)
263
- top_ps.append(seq.top_p if seq.top_p is not None else 1.0)
264
- repetition_penalties.append(seq.repetition_penalty)
265
- temperatures = torch.tensor(temperatures, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
266
- cfg_scales = torch.tensor(cfg_scales, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
267
- top_ks = torch.tensor(top_ks, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
268
- top_ps = torch.tensor(top_ps, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
269
- repetition_penalties = torch.tensor(repetition_penalties, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
 
 
 
 
 
 
 
 
 
 
 
270
  return temperatures, cfg_scales, top_ks, top_ps, repetition_penalties
271
 
272
  @torch.inference_mode()
@@ -293,27 +333,15 @@ class ModelRunner:
293
  [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
294
  where uncond_seqi is the paired unconditional sequence of cond_seqi."""
295
  # Check if this is a CFG batch (contains paired conditional and unconditional sequences)
296
- is_cfg_batch = False
297
- if len(seqs) > 0:
298
- # CFG batch if first sequence has cfg_scale > 1.0 and paired_seq
299
- if seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None:
300
- is_cfg_batch = True
301
- # Verify batch structure: first half conditional, second half unconditional
302
- num_cond = len(seqs) // 2
303
- for i in range(num_cond):
304
- if seqs[i].is_unconditional or seqs[i + num_cond].is_unconditional == False:
305
- is_cfg_batch = False
306
- break
307
-
308
  if is_cfg_batch:
309
  # CFG batch: seqs = [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
310
  num_cond = len(seqs) // 2
311
  cond_seqs = seqs[:num_cond]
312
- uncond_seqs = seqs[num_cond:]
313
 
314
  # Prepare inputs for both conditional and unconditional (they're already in the batch)
315
- input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
316
- else self.prepare_decode(seqs))
317
  sample_params = self.prepare_sample(seqs, is_cfg_batch=True) if self.rank == 0 else None
318
  if sample_params is not None:
319
  temperatures, cfg_scales, top_ks, top_ps, repetition_penalties = sample_params
@@ -364,7 +392,7 @@ class ModelRunner:
364
  logits_cfg[i:i+1] = seq.logits_processor(seq_input_ids, logits_cfg[i:i+1])
365
 
366
  # Prepare input_ids for sampler (for repetition penalty, though we already applied it)
367
- cond_input_ids = torch.tensor([seq.token_ids for seq in cond_seqs], device=logits_cfg.device)
368
 
369
  # Sample from CFG logits
370
  token_ids_cfg = self.sampler(
@@ -373,7 +401,7 @@ class ModelRunner:
373
  top_ks=top_ks if top_ks is not None else None,
374
  top_ps=top_ps if top_ps is not None else None,
375
  repetition_penalties=None, # Already applied above
376
- input_ids=cond_input_ids,
377
  ).tolist()
378
 
379
  # Update logits processor state after sampling
@@ -432,7 +460,7 @@ class ModelRunner:
432
  logits[i] = processed[0]
433
 
434
  # Prepare input_ids for sampler
435
- seq_input_ids = torch.tensor([seq.token_ids for seq in seqs], device=logits.device)
436
 
437
  token_ids = self.sampler(
438
  logits,
@@ -440,7 +468,7 @@ class ModelRunner:
440
  top_ks=top_ks if top_ks is not None else None,
441
  top_ps=top_ps if top_ps is not None else None,
442
  repetition_penalties=None, # Already applied above
443
- input_ids=seq_input_ids,
444
  ).tolist()
445
 
446
  # Update logits processor state after sampling
 
68
  self.model = Qwen3ForCausalLM(hf_config)
69
  load_model(self.model, config.model)
70
  self.sampler = Sampler()
71
+
72
+ # Pre-allocate buffers for sampling (optimization: avoid repeated tensor creation)
73
+ # Must be called before warmup_model() since it uses these buffers
74
+ self._allocate_sample_buffers()
75
+
76
  self.warmup_model()
77
  self.allocate_kv_cache()
78
  if not self.enforce_eager:
79
  self.capture_cudagraph()
80
+
81
  torch.set_default_device("cpu")
82
  torch.set_default_dtype(default_dtype)
83
 
 
90
  self.shm = SharedMemory(name="nanovllm")
91
  self.loop()
92
 
93
+ def _allocate_sample_buffers(self):
94
+ """Pre-allocate reusable buffers for sampling to avoid repeated tensor creation."""
95
+ max_bs = self.config.max_num_seqs
96
+ max_tokens = self.config.max_num_batched_tokens
97
+ max_num_blocks = (self.config.max_model_len + self.block_size - 1) // self.block_size
98
+
99
+ # Pre-allocate pinned memory buffers on CPU for fast transfer
100
+ # Must explicitly specify device="cpu" since default device may be "cuda"
101
+ self._cpu_temperatures = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
102
+ self._cpu_cfg_scales = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
103
+ self._cpu_top_ks = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
104
+ self._cpu_top_ps = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
105
+ self._cpu_repetition_penalties = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
106
+
107
+ # Pre-allocate decode buffers on CPU with pinned memory
108
+ self._cpu_input_ids = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)
109
+ self._cpu_positions = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)
110
+ self._cpu_slot_mapping = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
111
+ self._cpu_context_lens = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
112
+
113
+ # Pre-allocate prefill buffers on CPU with pinned memory (optimization to avoid repeated tensor creation)
114
+ self._cpu_prefill_input_ids = torch.zeros(max_tokens, dtype=torch.int64, device="cpu", pin_memory=True)
115
+ self._cpu_prefill_positions = torch.zeros(max_tokens, dtype=torch.int64, device="cpu", pin_memory=True)
116
+ self._cpu_prefill_cu_seqlens = torch.zeros(max_bs + 1, dtype=torch.int32, device="cpu", pin_memory=True)
117
+ self._cpu_prefill_slot_mapping = torch.zeros(max_tokens, dtype=torch.int32, device="cpu", pin_memory=True)
118
+
119
+ # Pre-allocate block tables buffer (shared by both decode and prefill)
120
+ self._cpu_block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32, device="cpu", pin_memory=True)
121
+
122
+ # Pre-allocate buffer for sequence token IDs (used in logits processor and sampler)
123
+ # Max length is max_model_len since sequences can be that long
124
+ self._seq_token_ids_buffer = torch.zeros(max_bs, self.config.max_model_len, dtype=torch.int64, device="cpu", pin_memory=True)
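The pinned-buffer reuse pattern above, shown in isolation (an illustrative sketch with made-up sizes, not the actual ModelRunner code; assumes a CUDA device is available):

import torch

max_bs = 8
pinned = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)

def to_device(values):
    # Fill the pre-allocated pinned buffer, then issue one async host-to-device copy of a slice.
    for i, v in enumerate(values):
        pinned[i] = v
    return pinned[:len(values)].cuda(non_blocking=True)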
125
+
126
  def exit(self):
127
  if self.world_size > 1:
128
  self.shm.close()
 
242
  if i != seq.num_blocks - 1:
243
  end = start + self.block_size
244
  else:
245
+ end = start + seq.last_block_num_tokens
246
  slot_mapping.extend(list(range(start, end)))
247
  if cu_seqlens_k[-1] > cu_seqlens_q[-1]: # prefix cache
248
  block_tables = self.prepare_block_tables(seqs)
 
255
  return input_ids, positions
256
 
257
  def prepare_decode(self, seqs: list[Sequence]):
258
+ """Optimized decode preparation using pre-allocated buffers."""
259
+ bs = len(seqs)
260
+
261
+ # Use pre-allocated CPU buffers
262
+ for i, seq in enumerate(seqs):
263
+ self._cpu_input_ids[i] = seq.last_token
264
+ self._cpu_positions[i] = len(seq) - 1
265
+ self._cpu_context_lens[i] = len(seq)
266
+ self._cpu_slot_mapping[i] = seq.block_table[-1] * self.block_size + seq.last_block_num_tokens - 1
267
+
268
+ # Transfer to GPU using sliced views
269
+ input_ids = self._cpu_input_ids[:bs].cuda(non_blocking=True)
270
+ positions = self._cpu_positions[:bs].cuda(non_blocking=True)
271
+ slot_mapping = self._cpu_slot_mapping[:bs].cuda(non_blocking=True)
272
+ context_lens = self._cpu_context_lens[:bs].cuda(non_blocking=True)
273
  block_tables = self.prepare_block_tables(seqs)
274
  set_context(False, slot_mapping=slot_mapping, context_lens=context_lens, block_tables=block_tables)
275
  return input_ids, positions
276
 
277
  def prepare_sample(self, seqs: list[Sequence], is_cfg_batch: bool = False):
278
+ """Optimized sample preparation using pre-allocated buffers."""
279
  if is_cfg_batch:
280
+ num_seqs = len(seqs) // 2
281
+ target_seqs = seqs[:num_seqs]
 
 
 
 
 
 
 
 
 
 
 
 
282
  else:
283
+ num_seqs = len(seqs)
284
+ target_seqs = seqs
285
+
286
+ # Fill pre-allocated CPU buffers
287
+ top_ks_is_zero = True
288
+ top_ps_is_one = True
289
+ repetition_penalties_is_one = True
290
+ for i, seq in enumerate(target_seqs):
291
+ self._cpu_temperatures[i] = seq.temperature
292
+ self._cpu_cfg_scales[i] = seq.cfg_scale
293
+ self._cpu_top_ks[i] = seq.top_k if seq.top_k is not None else 0
294
+ if seq.top_k is not None and seq.top_k > 0:
295
+ top_ks_is_zero = False
296
+ self._cpu_top_ps[i] = seq.top_p if seq.top_p is not None else 1.0
297
+ if seq.top_p is not None and seq.top_p != 1.0:  # top_p is active for this seq
298
+ top_ps_is_one = False
299
+ self._cpu_repetition_penalties[i] = seq.repetition_penalty if seq.repetition_penalty is not None else 1.0
300
+ if seq.repetition_penalty is not None and seq.repetition_penalty != 1.0:  # penalty is active
301
+ repetition_penalties_is_one = False
302
+
303
+ # Transfer to GPU using sliced views (single batched transfer)
304
+ temperatures = self._cpu_temperatures[:num_seqs].cuda(non_blocking=True)
305
+ cfg_scales = self._cpu_cfg_scales[:num_seqs].cuda(non_blocking=True)
306
+ top_ks = self._cpu_top_ks[:num_seqs].cuda(non_blocking=True) if not top_ks_is_zero else None
307
+ top_ps = self._cpu_top_ps[:num_seqs].cuda(non_blocking=True) if not top_ps_is_one else None
308
+ repetition_penalties = self._cpu_repetition_penalties[:num_seqs].cuda(non_blocking=True) if not repetition_penalties_is_one else None
309
+
310
  return temperatures, cfg_scales, top_ks, top_ps, repetition_penalties
311
 
312
  @torch.inference_mode()
 
333
  [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
334
  where uncond_seqi is the paired unconditional sequence of cond_seqi."""
335
  # Check if this is a CFG batch (contains paired conditional and unconditional sequences)
336
+ is_cfg_batch = seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None
 
 
 
 
 
 
 
 
 
 
 
337
  if is_cfg_batch:
338
  # CFG batch: seqs = [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
339
  num_cond = len(seqs) // 2
340
  cond_seqs = seqs[:num_cond]
341
+ # uncond_seqs = seqs[num_cond:]
342
 
343
  # Prepare inputs for both conditional and unconditional (they're already in the batch)
344
+ input_ids, positions = (self.prepare_prefill(seqs) if is_prefill else self.prepare_decode(seqs))
 
345
  sample_params = self.prepare_sample(seqs, is_cfg_batch=True) if self.rank == 0 else None
346
  if sample_params is not None:
347
  temperatures, cfg_scales, top_ks, top_ps, repetition_penalties = sample_params
 
392
  logits_cfg[i:i+1] = seq.logits_processor(seq_input_ids, logits_cfg[i:i+1])
393
 
394
  # Prepare input_ids for sampler (for repetition penalty, though we already applied it)
395
+ # cond_input_ids = torch.tensor([seq.token_ids for seq in cond_seqs], device=logits_cfg.device)
396
 
397
  # Sample from CFG logits
398
  token_ids_cfg = self.sampler(
 
401
  top_ks=top_ks if top_ks is not None else None,
402
  top_ps=top_ps if top_ps is not None else None,
403
  repetition_penalties=None, # Already applied above
404
+ # input_ids=cond_input_ids,
405
  ).tolist()
406
 
407
  # Update logits processor state after sampling
 
460
  logits[i] = processed[0]
461
 
462
  # Prepare input_ids for sampler
463
+ # seq_input_ids = torch.tensor([seq.token_ids for seq in seqs], device=logits.device)
464
 
465
  token_ids = self.sampler(
466
  logits,
 
468
  top_ks=top_ks if top_ks is not None else None,
469
  top_ps=top_ps if top_ps is not None else None,
470
  repetition_penalties=None, # Already applied above
471
+ # input_ids=seq_input_ids,
472
  ).tolist()
473
 
474
  # Update logits processor state after sampling
acestep/third_parts/nano-vllm/nanovllm/layers/sampler.py CHANGED
@@ -3,6 +3,83 @@ from torch import nn
3
  from typing import Optional
4
 
5
 
 
 
6
  class Sampler(nn.Module):
7
 
8
  def __init__(self):
@@ -19,56 +96,19 @@ class Sampler(nn.Module):
19
  input_ids: Optional[torch.Tensor] = None,
20
  ):
21
  """
22
- Sample tokens from logits with optional top-k, top-p, and repetition penalty.
23
 
24
- Args:
25
- logits: [batch_size, vocab_size] logits tensor
26
- temperatures: [batch_size] temperature values
27
- top_ks: Optional [batch_size] top-k values (None or 0 means no top-k filtering)
28
- top_ps: Optional [batch_size] top-p values (None or 1.0 means no top-p filtering)
29
- repetition_penalties: Optional [batch_size] repetition penalty values (1.0 means no penalty)
30
- input_ids: Optional [batch_size, seq_len] input token ids for repetition penalty
31
  """
32
- batch_size, vocab_size = logits.shape
33
-
34
- # Note: Repetition penalty is applied in ModelRunner before calling sampler
35
- # This allows us to use the full sequence context
36
-
37
  # Apply temperature
38
  logits = logits.float().div_(temperatures.unsqueeze(dim=1))
39
-
40
- # Apply top-k filtering if specified
41
- if top_ks is not None:
42
- for i in range(batch_size):
43
- top_k = top_ks[i].item()
44
- if top_k > 0 and top_k < vocab_size:
45
- # Get top-k logits, set others to -inf
46
- top_k_logits, top_k_indices = torch.topk(logits[i], int(top_k), dim=-1)
47
- filtered_logits = torch.full_like(logits[i], float('-inf'))
48
- filtered_logits[top_k_indices] = top_k_logits
49
- logits[i] = filtered_logits
50
-
51
- # Apply top-p (nucleus) filtering if specified
52
- if top_ps is not None:
53
- probs = torch.softmax(logits, dim=-1)
54
- for i in range(batch_size):
55
- top_p = top_ps[i].item()
56
- if 0.0 < top_p < 1.0:
57
- # Sort probabilities in descending order
58
- sorted_probs, sorted_indices = torch.sort(probs[i], descending=True)
59
- # Calculate cumulative probabilities
60
- cumsum_probs = torch.cumsum(sorted_probs, dim=-1)
61
- # Find the cutoff point
62
- cutoff_idx = (cumsum_probs <= top_p).sum().item()
63
- if cutoff_idx < len(sorted_indices):
64
- cutoff_idx += 1 # Include one more token to ensure we have at least one
65
- # Create mask for tokens to keep
66
- mask = torch.zeros_like(probs[i])
67
- mask[sorted_indices[:cutoff_idx]] = 1.0
68
- # Apply mask: set filtered tokens to -inf
69
- logits[i] = torch.where(mask > 0, logits[i], torch.tensor(float('-inf'), device=logits.device))
70
-
71
- # Sample using Gumbel-max trick (equivalent to sampling from softmax)
72
  probs = torch.softmax(logits, dim=-1)
73
  sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)).argmax(dim=-1)
74
- return sample_tokens
 
3
  from typing import Optional
4
 
5
 
6
+ def apply_top_k_top_p(
7
+ logits: torch.Tensor,
8
+ k: Optional[torch.Tensor],
9
+ p: Optional[torch.Tensor],
10
+ ) -> torch.Tensor:
11
+ """Apply top-k and top-p masks to the logits (vLLM style).
12
+
13
+ The logits tensor is updated in-place.
14
+ """
15
+ if p is None:
16
+ if k is None:
17
+ return logits
18
+ # Avoid sorting vocab for top-k only case
19
+ return apply_top_k_only(logits, k)
20
+
21
+ # Need to sort for top-p
22
+ logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
23
+
24
+ if k is not None:
25
+ # Apply top-k first
26
+ vocab_size = logits_sort.size(1)
27
+ # Clamp k to valid range
28
+ k_clamped = k.clamp(1, vocab_size).long()
29
+ top_k_mask_idx = vocab_size - k_clamped # shape: [B]
30
+ # Get the threshold value for each batch
31
+ top_k_thresh = logits_sort.gather(1, top_k_mask_idx.unsqueeze(1))
32
+ top_k_mask = logits_sort < top_k_thresh
33
+ logits_sort.masked_fill_(top_k_mask, float('-inf'))
34
+
35
+ # Apply top-p
36
+ probs_sort = logits_sort.softmax(dim=-1)
37
+ probs_sum = torch.cumsum(probs_sort, dim=-1, out=probs_sort) # reuse buffer
38
+ top_p_mask = probs_sum <= (1.0 - p.unsqueeze(1))
39
+ # Ensure at least one token is kept
40
+ top_p_mask[:, -1] = False
41
+ logits_sort.masked_fill_(top_p_mask, float('-inf'))
42
+
43
+ # Re-sort back to original positions
44
+ logits.scatter_(dim=-1, index=logits_idx, src=logits_sort)
45
+ return logits
46
+
47
+
48
+ def apply_top_k_only(
49
+ logits: torch.Tensor,
50
+ k: torch.Tensor,
51
+ ) -> torch.Tensor:
52
+ """Apply top-k mask without sorting the entire vocab (vLLM style).
53
+
54
+ This is much faster than sorting for top-k only cases.
55
+ The logits tensor is updated in-place.
56
+ """
57
+ vocab_size = logits.shape[1]
58
+ # Handle cases where k >= vocab_size (no filtering needed)
59
+ no_top_k_mask = (k <= 0) | (k >= vocab_size)
60
+ # Set invalid k to 1 so we can still gather
61
+ k_safe = k.masked_fill(no_top_k_mask, 1).long()
62
+ # NOTE: This int() causes CPU-GPU sync, but torch.topk requires Python int
63
+ max_top_k = int(k_safe.max().clamp(max=vocab_size))
64
+
65
+ # Get top-k values for all batches
66
+ # topk.values has shape [batch_size, max_top_k]
67
+ topk_values = logits.topk(max_top_k, dim=1).values
68
+
69
+ # Convert k to 0-based index: we want the k-th largest value (index k-1)
70
+ # Clamp to valid range for gather
71
+ k_index = (k_safe - 1).clamp(0, max_top_k - 1).unsqueeze(1) # shape: [B, 1]
72
+ # Gather the threshold value (the k-th largest)
73
+ top_k_thresh = topk_values.gather(1, k_index)
74
+
75
+ # For rows with no top-k filtering, set threshold to -inf so nothing gets masked
76
+ top_k_thresh.masked_fill_(no_top_k_mask.unsqueeze(1), float('-inf'))
77
+
78
+ # Mask all values below the threshold
79
+ logits.masked_fill_(logits < top_k_thresh, float('-inf'))
80
+ return logits
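A small self-check sketch (illustrative, not part of this diff) comparing the vectorized top-k path above against a per-row torch.topk reference; it assumes nanovllm is importable from this repo layout, and the logits/k values are made up:

import torch
from nanovllm.layers.sampler import apply_top_k_only

logits = torch.tensor([[0.1, 2.0, 1.0, 3.0],
                       [1.5, 0.2, 0.3, 0.4]])
k = torch.tensor([2, 1])

# Reference: mask everything below the k-th largest value, row by row.
expected = logits.clone()
for i in range(expected.size(0)):
    threshold = torch.topk(expected[i], int(k[i])).values[-1]
    expected[i] = expected[i].masked_fill(expected[i] < threshold, float("-inf"))

assert torch.equal(apply_top_k_only(logits.clone(), k), expected)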
81
+
82
+
83
  class Sampler(nn.Module):
84
 
85
  def __init__(self):
 
96
  input_ids: Optional[torch.Tensor] = None,
97
  ):
98
  """
99
+ Sample tokens from logits with optional top-k and top-p filtering.
100
 
101
+ Condition checking is done OUTSIDE the compiled function to avoid
102
+ graph breaks from .any() calls.
 
 
 
 
 
103
  """
 
 
 
 
 
104
  # Apply temperature
105
  logits = logits.float().div_(temperatures.unsqueeze(dim=1))
106
+
107
+ logits = apply_top_k_top_p(
108
+ logits,
109
+ top_ks,
110
+ top_ps,
111
+ )
 
 
112
  probs = torch.softmax(logits, dim=-1)
113
  sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)).argmax(dim=-1)
114
+ return sample_tokens
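The final two lines are the exponential-race form of the Gumbel-max trick; an illustrative comparison against torch.multinomial (a sketch with made-up probabilities, not part of this diff):

import torch

torch.manual_seed(0)
probs = torch.tensor([[0.7, 0.2, 0.1]])
n = 20000

# argmax of p_i / E_i with E_i ~ Exp(1) picks index i with probability p_i
race = torch.stack([
    (probs / torch.empty_like(probs).exponential_(1)).argmax(dim=-1)
    for _ in range(n)
])
baseline = torch.multinomial(probs.repeat(n, 1), num_samples=1)

print(torch.bincount(race.flatten(), minlength=3).float() / n)      # ~[0.70, 0.20, 0.10]
print(torch.bincount(baseline.flatten(), minlength=3).float() / n)  # ~[0.70, 0.20, 0.10]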
acestep/third_parts/nano-vllm/pyproject.toml CHANGED
@@ -15,8 +15,6 @@ dependencies = [
15
  "triton-windows>=3.0.0; sys_platform == 'win32'",
16
  "triton>=3.0.0; sys_platform != 'win32'",
17
  "transformers>=4.51.0",
18
- "flash-attn @ https://github.com/sdbds/flash-attention-for-windows/releases/download/2.8.3/flash_attn-2.8.3+cu128torch2.8.0cxx11abiFALSEfullbackward-cp311-cp311-win_amd64.whl; sys_platform == 'win32'",
19
- "flash-attn; sys_platform != 'win32'",
20
  "xxhash",
21
  ]
22
 
 
15
  "triton-windows>=3.0.0; sys_platform == 'win32'",
16
  "triton>=3.0.0; sys_platform != 'win32'",
17
  "transformers>=4.51.0",
 
 
18
  "xxhash",
19
  ]
20
 
profile_inference.py ADDED
@@ -0,0 +1,682 @@
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced profiling script for ACE-Step inference with deep LLM analysis
4
+
5
+ This script helps diagnose why LLM generation is slow by tracking:
6
+ 1. Total tokens generated vs expected throughput (200 tokens/sec baseline)
7
+ 2. Per-iteration timing to detect compilation overhead or slow operations
8
+ 3. Constrained decoding overhead
9
+ 4. CFG overhead (2x forward passes)
10
+ 5. Model forward time vs sampling/processing time
11
+
12
+ Usage:
13
+ python profile_inference.py # Standard profiling with warmup
14
+ python profile_inference.py --no-warmup # Profile first run (includes compilation)
15
+ python profile_inference.py --llm-debug # Deep LLM performance debugging
16
+ python profile_inference.py --detailed # Add cProfile function-level analysis
17
+
18
+ Inference mode options:
19
+ python profile_inference.py --thinking # Enable CoT for code generation
20
+ python profile_inference.py --use-constrained-decoding # Use FSM constrained decoding
21
+ python profile_inference.py --use-cot-metas # Enable LM to generate metadata via CoT
22
+ """
23
+
24
+ import time
25
+ import argparse
26
+ import sys
27
+ import os
28
+ from contextlib import contextmanager
29
+ from collections import defaultdict
30
+ import json
31
+ from typing import Tuple, Dict, Any, List
32
+ from functools import wraps
33
+
34
+ # Add project root to path
35
+ project_root = os.path.abspath(os.path.dirname(__file__))
36
+ if project_root not in sys.path:
37
+ sys.path.insert(0, project_root)
38
+
39
+ import torch
40
+ from acestep.inference import generate_music, GenerationParams, GenerationConfig
41
+ from acestep.handler import AceStepHandler
42
+ from acestep.llm_inference import LLMHandler
43
+
44
+
45
+ class PreciseTimer:
46
+ """High-precision timer with CUDA synchronization for accurate GPU timing"""
47
+
48
+ def __init__(self, device="cuda"):
49
+ self.device = device
50
+ self.timings = defaultdict(list)
51
+ self.enabled = True
52
+
53
+ def sync(self):
54
+ """Synchronize CUDA operations for accurate timing"""
55
+ if self.enabled and self.device.startswith("cuda") and torch.cuda.is_available():
56
+ torch.cuda.synchronize()
57
+
58
+ @contextmanager
59
+ def time(self, name: str):
60
+ """Time a code section with CUDA synchronization"""
61
+ if not self.enabled:
62
+ yield
63
+ return
64
+
65
+ self.sync()
66
+ start = time.perf_counter()
67
+ try:
68
+ yield
69
+ finally:
70
+ self.sync()
71
+ elapsed = time.perf_counter() - start
72
+ self.timings[name].append(elapsed)
73
+
74
+ def get_total(self, name: str) -> float:
75
+ """Get total accumulated time for a section"""
76
+ return sum(self.timings.get(name, []))
77
+
78
+ def get_mean(self, name: str) -> float:
79
+ """Get mean time per call for a section"""
80
+ times = self.timings.get(name, [])
81
+ return sum(times) / len(times) if times else 0.0
82
+
83
+ def get_count(self, name: str) -> int:
84
+ """Get number of calls for a section"""
85
+ return len(self.timings.get(name, []))
86
+
87
+ def get_all(self, name: str) -> List[float]:
88
+ """Get all timing samples for a section"""
89
+ return self.timings.get(name, [])
90
+
91
+
92
+ class LLMDebugger:
93
+ """Track detailed LLM performance metrics to diagnose slow generation"""
94
+
95
+ def __init__(self):
96
+ self.reset()
97
+
98
+ def reset(self):
99
+ """Reset all metrics"""
100
+ self.total_tokens = 0
101
+ self.generation_start = None
102
+ self.generation_end = None
103
+ self.output_text = ""
104
+ self.prompt_length = 0
105
+
106
+ def start(self, prompt_length: int = 0):
107
+ """Mark generation start"""
108
+ self.generation_start = time.perf_counter()
109
+ self.prompt_length = prompt_length
110
+
111
+ def end(self, output_text: str = ""):
112
+ """Mark generation end and store output"""
113
+ self.generation_end = time.perf_counter()
114
+ self.output_text = output_text
115
+
116
+ def set_token_count(self, count: int):
117
+ """Set total token count"""
118
+ self.total_tokens = count
119
+
120
+ def get_throughput(self) -> float:
121
+ """Calculate actual tokens per second"""
122
+ if self.generation_start and self.generation_end and self.total_tokens > 0:
123
+ total_time = self.generation_end - self.generation_start
124
+ if total_time > 0:
125
+ return self.total_tokens / total_time
126
+ return 0.0
127
+
128
+ def print_analysis(self):
129
+ """Print detailed LLM performance analysis"""
130
+ if not self.generation_start or not self.generation_end:
131
+ return
132
+
133
+ print("\n" + "=" * 100)
134
+ print("🔍 LLM PERFORMANCE DEEP DIVE")
135
+ print("=" * 100)
136
+
137
+ total_time = self.generation_end - self.generation_start
138
+ throughput = self.get_throughput()
139
+
140
+ # Basic metrics table
141
+ print(f"\n{'Metric':<40} {'Value':<20} {'Notes'}")
142
+ print("-" * 100)
143
+ print(f"{'Total Tokens Generated:':<40} {self.total_tokens:<20} (new tokens only)")
144
+ print(f"{'Prompt Length (estimate):':<40} {self.prompt_length:<20} (input tokens)")
145
+ print(f"{'Total Generation Time:':<40} {total_time:<20.3f} seconds")
146
+ print(f"{'Measured Throughput:':<40} {throughput:<20.1f} tokens/sec")
147
+ print(f"{'Expected Throughput:':<40} {'200':<20} tokens/sec (baseline)")
148
+
149
+ # Calculate performance gap
150
+ if throughput > 0:
151
+ slowdown = 200.0 / throughput
152
+ efficiency = (throughput / 200.0) * 100
153
+ print(f"{'Performance vs Baseline:':<40} {efficiency:<20.1f}% of expected")
154
+ print(f"{'Slowdown Factor:':<40} {slowdown:<20.2f}x slower")
155
+
156
+ # Analyze generated output
157
+ if self.output_text:
158
+ print(f"\n{'Output Analysis:':<40}")
159
+ print(f"{' Output length:':<40} {len(self.output_text):<20} characters")
160
+
161
+ # Count audio codes
162
+ import re
163
+ code_pattern = r'<\|audio_code_\d+\|>'
164
+ codes = re.findall(code_pattern, self.output_text)
165
+ if codes:
166
+ print(f"{' Audio codes generated:':<40} {len(codes):<20} codes")
167
+ print(f"{' Expected audio duration:':<40} {f'~{len(codes)/5:.1f}s':<20} (5 codes per second)")
168
+ if total_time > 0:
169
+ print(f"{' Time per audio code:':<40} {f'{total_time/len(codes)*1000:.1f}ms':<20}")
170
+
171
+ # Check for CoT section
172
+ if '<think>' in self.output_text and '</think>' in self.output_text:
173
+ cot_start = self.output_text.find('<think>')
174
+ cot_end = self.output_text.find('</think>') + 8
175
+ cot_section = self.output_text[cot_start:cot_end]
176
+ cot_token_est = len(cot_section) // 4
177
+ print(f"{' CoT section tokens (estimate):':<40} {f'~{cot_token_est}':<20}")
178
+
179
+ # Diagnostic guidance
180
+ print("\n" + "=" * 100)
181
+ print("🔧 DIAGNOSTIC GUIDANCE")
182
+ print("=" * 100)
183
+
184
+ if throughput < 50:
185
+ print("\n⚠️ CRITICAL: Throughput is extremely low (<50 tokens/sec)")
186
+ print("\nThis is at least 4x slower than the 200 tokens/sec baseline. Likely causes:")
187
+ print(" 1. ❗ Constrained decoding FSM overhead")
188
+ print(" → Each token triggers FSM state machine validation")
189
+ print(" → Try: set use_constrained_decoding=False in config")
190
+ print(" 2. ❗ CFG with double forward passes")
191
+ print(" → cfg_scale > 1.0 means running model twice per token")
192
+ print(" → Check: params.lm_cfg_scale value")
193
+ print(" 3. ❗ Running in eager mode without compilation")
194
+ print(" → PyTorch should compile kernels after warmup")
195
+ print(" → Check: torch._dynamo.config settings")
196
+
197
+ elif throughput < 100:
198
+ print("\n⚠️ WARNING: Throughput is low (50-100 tokens/sec)")
199
+ print("\nLikely causes:")
200
+ print(" 1. Constrained decoding overhead (~30-50% slowdown expected)")
201
+ print(" 2. CFG enabled (2x compute per token if cfg_scale > 1.0)")
202
+ print(" 3. Small model or inefficient GPU utilization")
203
+
204
+ elif throughput < 150:
205
+ print("\n⚠️ Throughput is below baseline but acceptable (100-150 tokens/sec)")
206
+ print("\nMinor overhead from:")
207
+ print(" - Constrained decoding: ~20-30% overhead")
208
+ print(" - Profiling instrumentation: ~5-10% overhead")
209
+
210
+ else:
211
+ print(f"\n✓ Throughput is good ({throughput:.1f} tokens/sec)")
212
+ print(" Performance is within acceptable range")
213
+
214
+
215
+ # Global instances
216
+ timer = None
217
+ llm_debugger = None
218
+
219
+
220
+ def wrap_method_with_timing(obj, method_name: str, timing_key: str):
221
+ """Wrap a method with timing instrumentation"""
222
+ original_method = getattr(obj, method_name)
223
+
224
+ @wraps(original_method)
225
+ def timed_wrapper(*args, **kwargs):
226
+ with timer.time(timing_key):
227
+ return original_method(*args, **kwargs)
228
+
229
+ setattr(obj, method_name, timed_wrapper)
230
+ return original_method
231
+
232
+
233
+ def wrap_llm_with_debug_tracking(llm_handler):
234
+ """Wrap LLM generation with detailed performance tracking"""
235
+ original_method = llm_handler.generate_with_stop_condition
236
+
237
+ @wraps(original_method)
238
+ def debug_wrapper(*args, **kwargs):
239
+ # Estimate prompt length
240
+ caption = kwargs.get('caption', args[0] if len(args) > 0 else "")
241
+ lyrics = kwargs.get('lyrics', args[1] if len(args) > 1 else "")
242
+ prompt_estimate = len(caption) + len(lyrics)
243
+ prompt_tokens_estimate = prompt_estimate // 4
244
+
245
+ # Start tracking
246
+ llm_debugger.reset()
247
+ llm_debugger.start(prompt_length=prompt_tokens_estimate)
248
+
249
+ # Call original with timing
250
+ with timer.time('llm_inference'):
251
+ result = original_method(*args, **kwargs)
252
+
253
+ # Extract and analyze output
254
+ output_text = ""
255
+ if isinstance(result, tuple) and len(result) >= 2:
256
+ if isinstance(result[1], list):
257
+ # Batch mode
258
+ output_text = "".join(result[1])
259
+ else:
260
+ # Single mode
261
+ cot_output = ""
262
+ if isinstance(result[0], dict):
263
+ for v in result[0].values():
264
+ if isinstance(v, str):
265
+ cot_output += v
266
+ output_text = cot_output + str(result[1])
267
+
268
+ # Count tokens
269
+ import re
270
+ code_pattern = r'<\|audio_code_\d+\|>'
271
+ codes = re.findall(code_pattern, output_text)
272
+ remaining_text = re.sub(code_pattern, '', output_text)
273
+ cot_tokens_estimate = len(remaining_text) // 4
274
+ total_tokens = len(codes) + cot_tokens_estimate
275
+
276
+ llm_debugger.set_token_count(total_tokens)
277
+ llm_debugger.end(output_text)
278
+
279
+ return result
280
+
281
+ llm_handler.generate_with_stop_condition = debug_wrapper
282
+ return original_method
283
+
284
+
285
+ def instrument_handlers(dit_handler, llm_handler, enable_llm_debug=False):
286
+ """Add timing instrumentation to handler methods"""
287
+ originals = {}
288
+
289
+ # Instrument LLM
290
+ if llm_handler and llm_handler.llm_initialized:
291
+ if enable_llm_debug:
292
+ originals['llm_generate'] = wrap_llm_with_debug_tracking(llm_handler)
293
+ else:
294
+ originals['llm_generate'] = wrap_method_with_timing(
295
+ llm_handler, 'generate_with_stop_condition', 'llm_inference'
296
+ )
297
+
298
+ # Instrument DiT handler
299
+ originals['dit_prepare'] = wrap_method_with_timing(
300
+ dit_handler, 'prepare_batch_data', 'prepare_batch_data'
301
+ )
302
+ originals['dit_generate'] = wrap_method_with_timing(
303
+ dit_handler, 'service_generate', 'dit_inference'
304
+ )
305
+ originals['dit_decode'] = wrap_method_with_timing(
306
+ dit_handler, 'tiled_decode', 'vae_decode'
307
+ )
308
+
309
+ return originals
310
+
311
+
312
+ def restore_handlers(dit_handler, llm_handler, originals):
313
+ """Restore original handler methods after profiling"""
314
+ if llm_handler and 'llm_generate' in originals:
315
+ llm_handler.generate_with_stop_condition = originals['llm_generate']
316
+
317
+ dit_handler.prepare_batch_data = originals['dit_prepare']
318
+ dit_handler.service_generate = originals['dit_generate']
319
+ dit_handler.tiled_decode = originals['dit_decode']
320
+
321
+
322
+ def print_profiling_results(total_time: float, show_llm_debug: bool = False):
323
+ """Print comprehensive profiling results with performance insights"""
324
+ print("\n" + "=" * 100)
325
+ print("🎯 PROFILING RESULTS")
326
+ print("=" * 100)
327
+
328
+ # Define timing categories
329
+ model_sections = {
330
+ 'llm_inference': 'LLM Inference (5Hz Language Model)',
331
+ 'dit_inference': 'DiT Inference (Diffusion Transformer)',
332
+ 'vae_decode': 'VAE Decode (Audio Decoder)',
333
+ }
334
+
335
+ non_model_sections = {
336
+ 'prepare_batch_data': 'Prepare Batch Data (embedding, formatting)',
337
+ }
338
+
339
+ # Calculate totals
340
+ model_time = sum(timer.get_total(k) for k in model_sections.keys())
341
+ non_model_time = sum(timer.get_total(k) for k in non_model_sections.keys())
342
+ other_time = total_time - model_time - non_model_time
343
+
344
+ # Print summary table
345
+ print(f"\n{'CATEGORY':<50} {'TIME (s)':<12} {'%':<8} {'CALLS':<8}")
346
+ print("-" * 100)
347
+
348
+ # Model time breakdown
349
+ print(f"\n{'🤖 MODEL TIME (Total)':<50} {model_time:<12.3f} {100*model_time/total_time:>6.1f}% {'':<8}")
350
+ for key, desc in model_sections.items():
351
+ t = timer.get_total(key)
352
+ c = timer.get_count(key)
353
+ if c > 0:
354
+ mean = timer.get_mean(key)
355
+ pct = 100 * t / total_time
356
+ print(f" {'├─ ' + desc:<48} {t:<12.3f} {pct:>6.1f}% {c:<8} (avg: {mean:.3f}s)")
357
+
358
+ # Non-model time breakdown
359
+ print(f"\n{'⚙️ NON-MODEL TIME (Total)':<50} {non_model_time:<12.3f} {100*non_model_time/total_time:>6.1f}% {'':<8}")
360
+ for key, desc in non_model_sections.items():
361
+ t = timer.get_total(key)
362
+ c = timer.get_count(key)
363
+ if c > 0:
364
+ mean = timer.get_mean(key)
365
+ pct = 100 * t / total_time
366
+ print(f" {'├─ ' + desc:<48} {t:<12.3f} {pct:>6.1f}% {c:<8} (avg: {mean:.3f}s)")
367
+
368
+ # Other time
369
+ if other_time > 0.01:
370
+ pct = 100 * other_time / total_time
371
+ print(f"\n{'📦 OTHER TIME (I/O, overhead, audio save)':<50} {other_time:<12.3f} {pct:>6.1f}% {'':<8}")
372
+
373
+ print(f"\n{'📊 TOTAL TIME':<50} {total_time:<12.3f} {'100.0%':>6} {'':<8}")
374
+
375
+ # Show LLM detailed analysis if enabled
376
+ if show_llm_debug:
377
+ llm_debugger.print_analysis()
378
+
379
+ # Performance insights
380
+ print("\n" + "=" * 100)
381
+ print("💡 PERFORMANCE INSIGHTS")
382
+ print("=" * 100)
383
+
384
+ llm_t = timer.get_total('llm_inference')
385
+ dit_t = timer.get_total('dit_inference')
386
+ vae_t = timer.get_total('vae_decode')
387
+ prep_t = timer.get_total('prepare_batch_data')
388
+
389
+ # Model time insights
390
+ if model_time > 0:
391
+ print(f"\n✓ Model operations: {model_time:.3f}s ({100*model_time/total_time:.1f}% of total)")
392
+
393
+ if llm_t > 0:
394
+ print(f" - LLM: {llm_t:.3f}s ({100*llm_t/model_time:.1f}% of model time)")
395
+ if dit_t > 0:
396
+ print(f" - DiT: {dit_t:.3f}s ({100*dit_t/model_time:.1f}% of model time)")
397
+ if vae_t > 0:
398
+ print(f" - VAE: {vae_t:.3f}s ({100*vae_t/model_time:.1f}% of model time)")
399
+
400
+ # LLM bottleneck analysis
401
+ if llm_t > dit_t and llm_t > 5.0:
402
+ print(f"\n⚠️ LLM IS THE BOTTLENECK: {llm_t:.3f}s ({100*llm_t/total_time:.1f}% of total)")
403
+ print(f"\n Possible causes:")
404
+ print(f" 1. Generating too many tokens → use --llm-debug to verify")
405
+ print(f" 2. Constrained decoding overhead → FSM validation per token")
406
+ print(f" 3. CFG overhead → cfg_scale > 1.0 = 2x forward passes")
407
+ print(f" 4. First-token latency → warmup should help")
408
+ print(f" 5. KV cache inefficiency → should be ~5-10ms/token")
409
+
410
+ # Non-model insights
411
+ if non_model_time / total_time > 0.1:
412
+ print(f"\n⚠️ Non-model operations: {non_model_time:.3f}s ({100*non_model_time/total_time:.1f}%)")
413
+ if prep_t > 0.1:
414
+ print(f" - Batch preparation: {prep_t:.3f}s")
415
+
416
+ # I/O overhead
417
+ if other_time / total_time > 0.2:
418
+ print(f"\n⚠️ Overhead/I/O: {other_time:.3f}s ({100*other_time/total_time:.1f}%)")
419
+
420
+ # Recommendations
421
+ print("\n" + "=" * 100)
422
+ print("🚀 OPTIMIZATION RECOMMENDATIONS")
423
+ print("=" * 100)
424
+
425
+ if llm_t > dit_t * 2:
426
+ print("\n🎯 Priority: Optimize LLM")
427
+ print(" 1. Run: python profile_inference.py --llm-debug")
428
+ print(" → Shows exact token count and throughput")
429
+ print(" 2. Check constrained decoding overhead")
430
+ print(" 3. Check CFG scaling (lm_cfg_scale parameter)")
431
+ print(" 4. Profile nanovllm engine step() timing")
432
+ print(" 5. Compare vllm vs transformers backends")
433
+
434
+
435
+ def run_profiled_generation(dit_handler, llm_handler, params, config,
436
+ enable_cprofile=False, enable_llm_debug=False):
437
+ """Execute generation with full profiling instrumentation"""
438
+ # Instrument handlers
439
+ originals = instrument_handlers(dit_handler, llm_handler, enable_llm_debug)
440
+
441
+ try:
442
+ print("\n[Profiling] Starting generation...")
443
+ timer.sync()
444
+ total_start = time.perf_counter()
445
+
446
+ # Optional cProfile
447
+ prof = None
448
+ if enable_cprofile:
449
+ import cProfile
450
+ prof = cProfile.Profile()
451
+ prof.enable()
452
+
453
+ # Run generation
454
+ result = generate_music(dit_handler, llm_handler, params, config, save_dir="./")
455
+
456
+ # Stop timing
457
+ timer.sync()
458
+ total_time = time.perf_counter() - total_start
459
+
460
+ # Save cProfile if enabled
461
+ if enable_cprofile and prof:
462
+ prof.disable()
463
+
464
+ import pstats
465
+ import io
466
+
467
+ output_file = "profile_cprofile_detailed.txt"
468
+ with open(output_file, 'w') as f:
469
+ ps = pstats.Stats(prof, stream=f)
470
+ ps.sort_stats('cumulative')
471
+ ps.print_stats(100)
472
+
473
+ # Print top functions
474
+ print("\n" + "=" * 100)
475
+ print("📊 TOP 20 FUNCTIONS BY CUMULATIVE TIME (cProfile)")
476
+ print("=" * 100)
477
+ s = io.StringIO()
478
+ ps = pstats.Stats(prof, stream=s)
479
+ ps.sort_stats('cumulative')
480
+ ps.print_stats(20)
481
+ print(s.getvalue())
482
+
483
+ print(f"\nFull report: {output_file}")
484
+
485
+ # Print results
486
+ print_profiling_results(total_time, show_llm_debug=enable_llm_debug)
487
+
488
+ return result, total_time
489
+
490
+ finally:
491
+ restore_handlers(dit_handler, llm_handler, originals)
492
+
493
+
494
+ def load_example_config(example_file: str) -> Tuple[GenerationParams, GenerationConfig]:
495
+ """Load configuration from example JSON file"""
496
+ try:
497
+ with open(example_file, 'r', encoding='utf-8') as f:
498
+ data = json.load(f)
499
+
500
+ params = GenerationParams(
501
+ caption=data.get('caption', ''),
502
+ lyrics=data.get('lyrics', ''),
503
+ bpm=data.get('bpm'),
504
+ keyscale=data.get('keyscale', ''),
505
+ timesignature=data.get('timesignature', ''),
506
+ vocal_language=data.get('language', 'unknown'),
507
+ duration=data.get('duration'),
508
+ thinking=data.get('think', False),
509
+ inference_steps=data.get('inference_steps', 8),
510
+ seed=data.get('seed', 42),
511
+ )
512
+
513
+ config = GenerationConfig(batch_size=data.get('batch_size', 1), seeds=[42])
514
+
515
+ return params, config
516
+
517
+ except Exception as e:
518
+ print(f" ❌ Failed to load: {e}")
519
+ return None, None
520
+
521
+
522
+ def main():
523
+ global timer, llm_debugger
524
+
525
+ parser = argparse.ArgumentParser(
526
+ description="Profile ACE-Step inference with LLM debugging"
527
+ )
528
+ parser.add_argument("--checkpoint-dir", type=str, default="./checkpoints")
529
+ parser.add_argument("--config-path", type=str, default="acestep-v15-turbo-rl")
530
+ parser.add_argument("--device", type=str, default="cuda")
531
+ parser.add_argument("--lm-model", type=str, default="acestep-5Hz-lm-0.6B-v3")
532
+ parser.add_argument("--lm-backend", type=str, default="vllm")
533
+ parser.add_argument("--no-warmup", action="store_true")
534
+ parser.add_argument("--detailed", action="store_true")
535
+ parser.add_argument("--llm-debug", action="store_true",
536
+ help="Enable deep LLM debugging (token count, throughput)")
537
+ parser.add_argument("--example", type=str, default="example_05.json")
538
+
539
+ # Inference mode parameters
540
+ parser.add_argument("--thinking", action="store_true",
541
+ help="Enable CoT reasoning for LM to generate audio codes")
542
+ parser.add_argument("--use-constrained-decoding", action="store_true",
543
+ help="Use FSM-based constrained decoding for meta generation")
544
+ parser.add_argument("--use-cot-metas", action="store_true",
545
+ help="Enable LLM to generate music metadata via CoT reasoning")
546
+
547
+ args = parser.parse_args()
548
+
549
+ # Initialize
550
+ timer = PreciseTimer(device=args.device)
551
+ llm_debugger = LLMDebugger()
552
+
553
+ print("=" * 100)
554
+ print("🎵 ACE-Step Inference Profiler (LLM Performance Analysis)")
555
+ print("=" * 100)
556
+ print(f"\nConfiguration:")
557
+ print(f" Device: {args.device}")
558
+ print(f" LLM Backend: {args.lm_backend}")
559
+ print(f" LLM Debug: {'Enabled' if args.llm_debug else 'Disabled'}")
560
+ print(f" Warmup: {'Disabled' if args.no_warmup else 'Enabled'}")
561
+ print(f"\nInference Mode:")
562
+ print(f" Thinking (CoT): {'Enabled' if args.thinking else 'Disabled'}")
563
+ print(f" Constrained Decoding: {'Enabled' if args.use_constrained_decoding else 'Disabled'}")
564
+ print(f" Use CoT for Metas: {'Enabled' if args.use_cot_metas else 'Disabled'}")
565
+
566
+ # Initialize models
567
+ print(f"\nInitializing models...")
568
+
569
+ dit_handler = AceStepHandler()
570
+ llm_handler = LLMHandler()
571
+
572
+ print(" 🎹 Initializing DiT...")
573
+ status_dit, success_dit = dit_handler.initialize_service(
574
+ project_root=project_root,
575
+ config_path=args.config_path,
576
+ device=args.device,
577
+ use_flash_attention=True,
578
+ )
579
+ if not success_dit:
580
+ print(f" ❌ Failed: {status_dit}")
581
+ sys.exit(1)
582
+ print(f" ✓ DiT ready")
583
+
584
+ print(" 🧠 Initializing LLM...")
585
+ if args.thinking or args.use_cot_metas:
586
+ status_llm, success_llm = llm_handler.initialize(
587
+ checkpoint_dir=args.checkpoint_dir,
588
+ lm_model_path=args.lm_model,
589
+ backend=args.lm_backend,
590
+ device=args.device,
591
+ )
592
+ if success_llm:
593
+ print(f" ✓ LLM ready ({args.lm_backend})")
594
+ else:
595
+ print(f" ⚠ Failed: {status_llm}")
596
+ else:
597
+ print(f" ✓ LLM not initialized (neither thinking nor use_cot_metas is enabled)")
598
+
599
+ # Load example
600
+ example_file = os.path.join(project_root, "examples", "text2music", args.example)
601
+ if not os.path.exists(example_file):
602
+ print(f"\n❌ Not found: {example_file}")
603
+ sys.exit(1)
604
+
605
+ print(f"\n📄 Loading: {args.example}")
606
+ params, config = load_example_config(example_file)
607
+
608
+ if not params or not config:
609
+ print("❌ Failed to load config")
610
+ sys.exit(1)
611
+
612
+ print(f" Caption: {params.caption[:60]}...")
613
+ print(f" Batch: {config.batch_size}, Steps: {params.inference_steps}, Thinking: {params.thinking}")
614
+
615
+ # Warmup
616
+ if not args.no_warmup:
617
+ print("\n" + "=" * 100)
618
+ print("🔥 WARMUP RUN")
619
+ print("=" * 100)
620
+
621
+ warmup_params = GenerationParams(
622
+ caption=params.caption,
623
+ lyrics=params.lyrics,
624
+ bpm=params.bpm,
625
+ keyscale=params.keyscale,
626
+ timesignature=params.timesignature,
627
+ vocal_language=params.vocal_language,
628
+ duration=params.duration,
629
+ thinking=args.thinking,
630
+ use_cot_metas=args.use_cot_metas,
631
+ inference_steps=params.inference_steps,
632
+ seed=params.seed,
633
+ )
634
+ warmup_config = GenerationConfig(batch_size=1, seeds=[42])
635
+ warmup_config.use_constrained_decoding = args.use_constrained_decoding
636
+
637
+ warmup_start = time.perf_counter()
638
+ warmup_result = generate_music(dit_handler, llm_handler, warmup_params, warmup_config, save_dir="./")
639
+ warmup_time = time.perf_counter() - warmup_start
640
+
641
+ print(f"\n✓ Warmup: {warmup_time:.2f}s")
642
+ if not warmup_result.success:
643
+ print(f"⚠️ Warning: {warmup_result.error}")
644
+
645
+ # Reset
646
+ timer = PreciseTimer(device=args.device)
647
+ llm_debugger = LLMDebugger()
648
+
649
+ # Profiling run
650
+ print("\n" + "=" * 100)
651
+ print("⏱️ PROFILING RUN")
652
+ print("=" * 100)
653
+
654
+ # Apply inference mode settings
655
+ config.use_constrained_decoding = args.use_constrained_decoding
656
+ # Override thinking and use_cot_metas parameters if specified via CLI
657
+ if args.thinking:
658
+ params.thinking = True
659
+ if args.use_cot_metas:
660
+ params.use_cot_metas = True
661
+
662
+ result, total_time = run_profiled_generation(
663
+ dit_handler, llm_handler, params, config,
664
+ enable_cprofile=args.detailed,
665
+ enable_llm_debug=args.llm_debug
666
+ )
667
+
668
+ if not result.success:
669
+ print(f"\n❌ Failed: {result.error}")
670
+ sys.exit(1)
671
+
672
+ print(f"\n✅ Success! Generated {len(result.audios)} audio file(s)")
673
+
674
+ # Final tips
675
+ if args.detailed:
676
+ print("\n💡 Check profile_cprofile_detailed.txt for function-level analysis")
677
+ elif not args.llm_debug:
678
+ print("\n💡 Run with --llm-debug to see LLM token count and throughput analysis")
679
+
680
+
681
+ if __name__ == "__main__":
682
+ main()
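
For reference, the profiling added above works by temporarily monkey-patching handler methods with timed wrappers and restoring the originals afterwards (instrument_handlers / restore_handlers). The following is a minimal standalone sketch of that pattern, not part of the commit; the Handler class and section names are illustrative stand-ins for the real AceStepHandler/LLMHandler methods:

import time
from collections import defaultdict
from contextlib import contextmanager
from functools import wraps


class SectionTimer:
    """Accumulate wall-clock time per named section."""

    def __init__(self):
        self.timings = defaultdict(list)

    @contextmanager
    def time(self, name):
        start = time.perf_counter()
        try:
            yield
        finally:
            self.timings[name].append(time.perf_counter() - start)


def wrap_with_timing(obj, method_name, timer, key):
    """Swap obj.method_name for a timed wrapper; return the original so it can be restored."""
    original = getattr(obj, method_name)

    @wraps(original)
    def timed(*args, **kwargs):
        with timer.time(key):
            return original(*args, **kwargs)

    setattr(obj, method_name, timed)
    return original


class Handler:  # hypothetical stand-in for the real inference handlers
    def decode(self):
        time.sleep(0.05)


timer = SectionTimer()
handler = Handler()
original = wrap_with_timing(handler, "decode", timer, "vae_decode")
try:
    handler.decode()
finally:
    setattr(handler, "decode", original)  # always restore, as restore_handlers() does
print({name: sum(samples) for name, samples in timer.timings.items()})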