ChuxiJ committed
Commit bc7e55b · 1 parent: 7ef4a67

support input timesteps

acestep/api_server.py CHANGED
@@ -102,6 +102,10 @@ class GenerateMusicRequest(BaseModel):
     cfg_interval_start: float = 0.0
     cfg_interval_end: float = 1.0
     infer_method: str = "ode"  # "ode" or "sde" - diffusion inference method
+    timesteps: Optional[str] = Field(
+        default=None,
+        description="Custom timesteps (comma-separated, e.g., '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0')"
+    )
 
     audio_format: str = "mp3"
     use_tiled_decode: bool = True
@@ -754,13 +758,14 @@ def create_app() -> FastAPI:
         keyscale=key_scale,
         timesignature=time_signature,
         duration=audio_duration if audio_duration else -1.0,
-        inference_steps=req.inference_steps,
+        inference_steps=actual_inference_steps,
         seed=req.seed,
         guidance_scale=req.guidance_scale,
         use_adg=req.use_adg,
         cfg_interval_start=req.cfg_interval_start,
         cfg_interval_end=req.cfg_interval_end,
         infer_method=req.infer_method,
+        timesteps=parsed_timesteps,
         repainting_start=req.repainting_start,
         repainting_end=req.repainting_end if req.repainting_end else -1,
         audio_cover_strength=req.audio_cover_strength,
@@ -1289,5 +1294,11 @@ def main() -> None:
     )
 
 
+if __name__ == "__main__":
+    main()
+    ,
+    )
+
+
 if __name__ == "__main__":
     main()
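For reference, a client call exercising the new request field might look like the sketch below. The endpoint path, port, and the "prompt" field are placeholders invented for illustration; only `timesteps`, `infer_method`, and `audio_format` come from the GenerateMusicRequest fields shown above, and per the UI help text added in this commit a custom schedule overrides `inference_steps` and `shift`.

import requests

# Hypothetical endpoint/port and "prompt" field; timesteps/infer_method/audio_format
# mirror the GenerateMusicRequest fields shown or added in this commit.
payload = {
    "prompt": "warm lo-fi piano with vinyl crackle",  # placeholder field
    "infer_method": "ode",
    "audio_format": "mp3",
    # 9 boundaries -> 8 denoising steps
    "timesteps": "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0",
}
resp = requests.post("http://localhost:8000/generate_music", json=payload)
resp.raise_for_status()
print(resp.json())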
acestep/gradio_ui/events/__init__.py CHANGED
@@ -54,7 +54,19 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["offload_to_cpu_checkbox"],
             generation_section["offload_dit_to_cpu_checkbox"],
         ],
-        outputs=[generation_section["init_status"], generation_section["generate_btn"], generation_section["service_config_accordion"]]
+        outputs=[
+            generation_section["init_status"],
+            generation_section["generate_btn"],
+            generation_section["service_config_accordion"],
+            # Model type settings (updated based on actual loaded model)
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["use_adg"],
+            generation_section["shift"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["task_type"],
+        ]
     )
 
     # ========== UI Visibility Updates ==========
@@ -312,6 +324,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["cfg_interval_end"],
             generation_section["shift"],
             generation_section["infer_method"],
+            generation_section["custom_timesteps"],
             generation_section["audio_format"],
             generation_section["lm_temperature"],
             generation_section["lm_cfg_scale"],
@@ -510,6 +523,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["cfg_interval_end"],
             generation_section["shift"],
             generation_section["infer_method"],
+            generation_section["custom_timesteps"],
             generation_section["audio_format"],
             generation_section["lm_temperature"],
             generation_section["think_checkbox"],
@@ -697,6 +711,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["cfg_interval_end"],
             generation_section["shift"],
             generation_section["infer_method"],
+            generation_section["custom_timesteps"],
             generation_section["audio_format"],
             generation_section["lm_temperature"],
             generation_section["think_checkbox"],
acestep/gradio_ui/events/generation_handlers.py CHANGED
@@ -7,7 +7,7 @@ import json
 import random
 import glob
 import gradio as gr
-from typing import Optional
+from typing import Optional, List, Tuple
 from acestep.constants import (
     TASK_TYPES_TURBO,
     TASK_TYPES_BASE,
@@ -16,6 +16,56 @@ from acestep.gradio_ui.i18n import t
 from acestep.inference import understand_music, create_sample, format_sample
 
 
+def parse_and_validate_timesteps(
+    timesteps_str: str,
+    inference_steps: int
+) -> Tuple[Optional[List[float]], bool, str]:
+    """
+    Parse timesteps string and validate.
+
+    Args:
+        timesteps_str: Comma-separated timesteps string (e.g., "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0")
+        inference_steps: Expected number of inference steps
+
+    Returns:
+        Tuple of (parsed_timesteps, has_warning, warning_message)
+        - parsed_timesteps: List of float timesteps, or None if invalid/empty
+        - has_warning: Whether a warning was shown
+        - warning_message: Description of the warning
+    """
+    if not timesteps_str or not timesteps_str.strip():
+        return None, False, ""
+
+    # Parse comma-separated values
+    values = [v.strip() for v in timesteps_str.split(",") if v.strip()]
+
+    if not values:
+        return None, False, ""
+
+    # Handle optional trailing 0
+    if values[-1] != "0":
+        values.append("0")
+
+    try:
+        timesteps = [float(v) for v in values]
+    except ValueError:
+        gr.Warning(t("messages.invalid_timesteps_format"))
+        return None, True, "Invalid format"
+
+    # Validate range [0, 1]
+    if any(ts < 0 or ts > 1 for ts in timesteps):
+        gr.Warning(t("messages.timesteps_out_of_range"))
+        return None, True, "Out of range"
+
+    # Check if count matches inference_steps
+    actual_steps = len(timesteps) - 1
+    if actual_steps != inference_steps:
+        gr.Warning(t("messages.timesteps_count_mismatch", actual=actual_steps, expected=inference_steps))
+        return timesteps, True, f"Using {actual_steps} steps from timesteps"
+
+    return timesteps, False, ""
+
+
 def load_metadata(file_obj):
     """Load generation parameters from a JSON file"""
     if file_obj is None:
@@ -321,50 +371,31 @@ def refresh_checkpoints(dit_handler):
 
 
 def update_model_type_settings(config_path):
-    """Update UI settings based on model type"""
+    """Update UI settings based on model type (fallback when handler not initialized yet)
+
+    Note: This is used as a fallback when the user changes config_path dropdown
+    before initializing the model. The actual settings are determined by the
+    handler's is_turbo_model() method after initialization.
+    """
     if config_path is None:
         config_path = ""
     config_path_lower = config_path.lower()
 
+    # Determine is_turbo based on config_path string
+    # This is a heuristic fallback - actual model type is determined after loading
     if "turbo" in config_path_lower:
-        # Turbo model: max 8 steps, hide CFG/ADG/shift, only show text2music/repaint/cover
-        # Shift is not effective for turbo models, default to 1.0
-        return (
-            gr.update(value=8, maximum=8, minimum=1),  # inference_steps
-            gr.update(visible=False),  # guidance_scale
-            gr.update(visible=False),  # use_adg
-            gr.update(value=1.0, visible=False),  # shift (not effective for turbo)
-            gr.update(visible=False),  # cfg_interval_start
-            gr.update(visible=False),  # cfg_interval_end
-            gr.update(choices=TASK_TYPES_TURBO),  # task_type
-        )
+        is_turbo = True
     elif "base" in config_path_lower:
-        # Base model: max 100 steps, show CFG/ADG/shift, show all task types
-        # Shift range 1.0~5.0, default 3.0 for base models
-        return (
-            gr.update(value=32, maximum=100, minimum=1),  # inference_steps
-            gr.update(visible=True),  # guidance_scale
-            gr.update(visible=True),  # use_adg
-            gr.update(value=3.0, visible=True),  # shift (effective for base, default 3.0)
-            gr.update(visible=True),  # cfg_interval_start
-            gr.update(visible=True),  # cfg_interval_end
-            gr.update(choices=TASK_TYPES_BASE),  # task_type
-        )
+        is_turbo = False
     else:
-        # Default to turbo settings
-        return (
-            gr.update(value=8, maximum=8, minimum=1),
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(value=1.0, visible=False),  # shift default 1.0
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(choices=TASK_TYPES_TURBO),  # task_type
-        )
+        # Default to turbo settings for unknown model types
+        is_turbo = True
+
+    return get_model_type_ui_settings(is_turbo)
 
 
 def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu):
-    """Wrapper for service initialization, returns status, button state, and accordion state"""
+    """Wrapper for service initialization, returns status, button state, accordion state, and model type settings"""
     # Initialize DiT handler
     status, enable = dit_handler.initialize_service(
         checkpoint, config_path, device,
@@ -400,7 +431,42 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi
     is_model_initialized = dit_handler.model is not None
     accordion_state = gr.update(open=not is_model_initialized)
 
-    return status, gr.update(interactive=enable), accordion_state
+    # Get model type settings based on actual loaded model
+    is_turbo = dit_handler.is_turbo_model()
+    model_type_settings = get_model_type_ui_settings(is_turbo)
+
+    return (
+        status,
+        gr.update(interactive=enable),
+        accordion_state,
+        *model_type_settings
+    )
+
+
+def get_model_type_ui_settings(is_turbo: bool):
+    """Get UI settings based on whether the model is turbo or base"""
+    if is_turbo:
+        # Turbo model: max 8 steps, hide CFG/ADG/shift, only show text2music/repaint/cover
+        return (
+            gr.update(value=8, maximum=8, minimum=1),  # inference_steps
+            gr.update(visible=False),  # guidance_scale
+            gr.update(visible=False),  # use_adg
+            gr.update(value=1.0, visible=False),  # shift (not effective for turbo)
+            gr.update(visible=False),  # cfg_interval_start
+            gr.update(visible=False),  # cfg_interval_end
+            gr.update(choices=TASK_TYPES_TURBO),  # task_type
+        )
+    else:
+        # Base model: max 200 steps, default 32, show CFG/ADG/shift, show all task types
+        return (
+            gr.update(value=32, maximum=200, minimum=1),  # inference_steps
+            gr.update(visible=True),  # guidance_scale
+            gr.update(visible=True),  # use_adg
+            gr.update(value=3.0, visible=True),  # shift (effective for base, default 3.0)
+            gr.update(visible=True),  # cfg_interval_start
+            gr.update(visible=True),  # cfg_interval_end
            gr.update(choices=TASK_TYPES_BASE),  # task_type
+        )
 
 
 def update_negative_prompt_visibility(init_llm_checked):
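As a quick sanity check on the helper added above, the snippet below walks through its three outcomes. It is illustrative only; note that the gr.Warning calls inside the helper assume a live Gradio request context.

from acestep.gradio_ui.events.generation_handlers import parse_and_validate_timesteps

# 9 boundaries matching inference_steps=8: parsed cleanly, no warning
ts, warned, msg = parse_and_validate_timesteps(
    "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0", 8
)
assert warned is False and len(ts) - 1 == 8

# Trailing 0 omitted: it is appended automatically, so this is still 8 steps
ts, warned, msg = parse_and_validate_timesteps(
    "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085", 8
)
assert len(ts) - 1 == 8

# Count mismatch: the schedule still parses, a warning is raised, and the
# caller (generate_with_progress) switches to len(ts) - 1 steps
ts, warned, msg = parse_and_validate_timesteps("0.8,0.5,0.2,0", 8)
assert warned is True and len(ts) - 1 == 3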
acestep/gradio_ui/events/results_handlers.py CHANGED
@@ -15,6 +15,7 @@ from typing import Dict, Any, Optional, List
 import gradio as gr
 from loguru import logger
 from acestep.gradio_ui.i18n import t
+from acestep.gradio_ui.events.generation_handlers import parse_and_validate_timesteps
 from acestep.inference import generate_music, GenerationParams, GenerationConfig
 from acestep.audio_utils import save_audio
 
@@ -452,7 +453,7 @@ def generate_with_progress(
     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
-    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
+    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, custom_timesteps, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
     constrained_decoding_debug,
@@ -473,6 +474,14 @@
         logger.info("[generate_with_progress] Skipping Phase 1 metas COT: sample is already formatted (is_format_caption=True)")
         gr.Info(t("messages.skipping_metas_cot"))
 
+    # Parse and validate custom timesteps
+    parsed_timesteps, has_timesteps_warning, _ = parse_and_validate_timesteps(custom_timesteps, inference_steps)
+
+    # Update inference_steps if custom timesteps provided (to match UI display)
+    actual_inference_steps = inference_steps
+    if parsed_timesteps is not None:
+        actual_inference_steps = len(parsed_timesteps) - 1
+
     # step 1: prepare inputs
     # generate_music, GenerationParams, GenerationConfig
     gen_params = GenerationParams(
@@ -489,13 +498,14 @@
         keyscale=key_scale,
         timesignature=time_signature,
         duration=audio_duration,
-        inference_steps=inference_steps,
+        inference_steps=actual_inference_steps,
         guidance_scale=guidance_scale,
         use_adg=use_adg,
         cfg_interval_start=cfg_interval_start,
         cfg_interval_end=cfg_interval_end,
         shift=shift,
         infer_method=infer_method,
+        timesteps=parsed_timesteps,
         repainting_start=repainting_start,
         repainting_end=repainting_end,
         audio_cover_strength=audio_cover_strength,
@@ -1311,7 +1321,7 @@ def capture_current_params(
     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
-    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
+    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, custom_timesteps, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language,
     constrained_decoding_debug, allow_lm_batch, auto_score, auto_lrc, score_scale, lm_batch_chunk_size,
@@ -1349,6 +1359,7 @@
         "cfg_interval_end": cfg_interval_end,
         "shift": shift,
         "infer_method": infer_method,
+        "custom_timesteps": custom_timesteps,
         "audio_format": audio_format,
         "lm_temperature": lm_temperature,
         "think_checkbox": think_checkbox,
@@ -1377,7 +1388,7 @@ def generate_with_batch_management(
     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
-    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
+    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, custom_timesteps, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
     constrained_decoding_debug,
@@ -1406,7 +1417,7 @@
     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
-    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
+    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, custom_timesteps, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
     constrained_decoding_debug,
@@ -1673,6 +1684,7 @@ def generate_next_batch_background(
     params.setdefault("cfg_interval_end", 1.0)
     params.setdefault("shift", 1.0)
     params.setdefault("infer_method", "ode")
+    params.setdefault("custom_timesteps", "")
     params.setdefault("audio_format", "mp3")
     params.setdefault("lm_temperature", 0.85)
     params.setdefault("think_checkbox", True)
@@ -1724,6 +1736,7 @@
         cfg_interval_end=params.get("cfg_interval_end"),
         shift=params.get("shift"),
         infer_method=params.get("infer_method"),
+        custom_timesteps=params.get("custom_timesteps"),
         audio_format=params.get("audio_format"),
         lm_temperature=params.get("lm_temperature"),
         think_checkbox=params.get("think_checkbox"),
acestep/gradio_ui/i18n/en.json CHANGED
@@ -115,7 +115,7 @@
     "batch_size_info": "Number of audio to generate (max 8)",
     "advanced_settings": "🔧 Advanced Settings",
     "inference_steps_label": "DiT Inference Steps",
-    "inference_steps_info": "Turbo: max 8, Base: max 100",
+    "inference_steps_info": "Turbo: max 8, Base: max 200",
     "guidance_scale_label": "DiT Guidance Scale (Only support for base model)",
     "guidance_scale_info": "Higher values follow text more closely",
     "seed_label": "Seed",
@@ -130,6 +130,8 @@
     "shift_info": "Timestep shift factor for base models (range 1.0~5.0, default 3.0). Not effective for turbo models.",
     "infer_method_label": "Inference Method",
     "infer_method_info": "Diffusion inference method. ODE (Euler) is faster, SDE (stochastic) may produce different results.",
+    "custom_timesteps_label": "Custom Timesteps",
+    "custom_timesteps_info": "Optional: comma-separated values from 1.0 to 0.0 (e.g., '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'). Overrides inference steps and shift.",
     "cfg_interval_start": "CFG Interval Start",
     "cfg_interval_end": "CFG Interval End",
     "lm_params_title": "🤖 LM Generation Parameters",
@@ -233,6 +235,9 @@
     "simple_example_loaded": "🎲 Loaded random example from {filename}",
     "format_success": "✅ Caption and lyrics formatted successfully",
     "format_failed": "❌ Format failed: {error}",
-    "skipping_metas_cot": "⚡ Skipping Phase 1 metas COT (sample already formatted)"
+    "skipping_metas_cot": "⚡ Skipping Phase 1 metas COT (sample already formatted)",
+    "invalid_timesteps_format": "⚠️ Invalid timesteps format. Using default schedule.",
+    "timesteps_out_of_range": "⚠️ Timesteps must be in range [0, 1]. Using default schedule.",
+    "timesteps_count_mismatch": "⚠️ Timesteps count ({actual}) differs from inference_steps ({expected}). Using timesteps count."
   }
 }
acestep/gradio_ui/i18n/ja.json CHANGED
@@ -115,7 +115,7 @@
     "batch_size_info": "生成するオーディオの数(最大8)",
     "advanced_settings": "🔧 詳細設定",
     "inference_steps_label": "DiT 推論ステップ",
-    "inference_steps_info": "Turbo: 最大8、Base: 最大100",
+    "inference_steps_info": "Turbo: 最大8、Base: 最大200",
     "guidance_scale_label": "DiT ガイダンススケール(baseモデルのみサポート)",
     "guidance_scale_info": "値が高いほどテキストに忠実に従う",
     "seed_label": "シード",
@@ -130,6 +130,8 @@
     "shift_info": "baseモデル用タイムステップシフト係数 (範囲 1.0~5.0、デフォルト 3.0)。turboモデルには無効。",
     "infer_method_label": "推論方法",
     "infer_method_info": "拡散推論方法。ODE (オイラー) は高速、SDE (確率的) は異なる結果を生成する可能性があります。",
+    "custom_timesteps_label": "カスタムタイムステップ",
+    "custom_timesteps_info": "オプション:1.0から0.0へのカンマ区切り値(例:'0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0')。推論ステップとシフトを上書きします。",
     "cfg_interval_start": "CFG 間隔開始",
     "cfg_interval_end": "CFG 間隔終了",
     "lm_params_title": "🤖 LM 生成パラメータ",
@@ -233,6 +235,9 @@
     "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました",
     "format_success": "✅ キャプションと歌詞のフォーマットに成功しました",
     "format_failed": "❌ フォーマットに失敗しました: {error}",
-    "skipping_metas_cot": "⚡ Phase 1 メタデータ COT をスキップ(サンプルは既にフォーマット済み)"
+    "skipping_metas_cot": "⚡ Phase 1 メタデータ COT をスキップ(サンプルは既にフォーマット済み)",
+    "invalid_timesteps_format": "⚠️ タイムステップ形式が無効です。デフォルトスケジュールを使用します。",
+    "timesteps_out_of_range": "⚠️ タイムステップは [0, 1] の範囲内である必要があります。デフォルトスケジュールを使用します。",
+    "timesteps_count_mismatch": "⚠️ タイムステップ数 ({actual}) が推論ステップ数 ({expected}) と異なります。タイムステップ数を使用します。"
   }
 }
acestep/gradio_ui/i18n/zh.json CHANGED
@@ -115,7 +115,7 @@
     "batch_size_info": "要生成的音频数量(最多8个)",
     "advanced_settings": "🔧 高级设置",
     "inference_steps_label": "DiT 推理步数",
-    "inference_steps_info": "Turbo: 最多8, Base: 最多100",
+    "inference_steps_info": "Turbo: 最多8, Base: 最多200",
     "guidance_scale_label": "DiT 引导比例(仅支持base模型)",
     "guidance_scale_info": "更高的值更紧密地遵循文本",
     "seed_label": "种子",
@@ -130,6 +130,8 @@
     "shift_info": "时间步偏移因子,仅对 base 模型生效 (范围 1.0~5.0,默认 3.0)。对 turbo 模型无效。",
     "infer_method_label": "推理方法",
     "infer_method_info": "扩散推理方法。ODE (欧拉) 更快,SDE (随机) 可能产生不同结果。",
+    "custom_timesteps_label": "自定义时间步",
+    "custom_timesteps_info": "可选:从 1.0 到 0.0 的逗号分隔值(例如 '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0')。会覆盖推理步数和 shift 设置。",
     "cfg_interval_start": "CFG 间隔开始",
     "cfg_interval_end": "CFG 间隔结束",
     "lm_params_title": "🤖 LM 生成参数",
@@ -233,6 +235,9 @@
     "simple_example_loaded": "🎲 已从 {filename} 加载随机示例",
     "format_success": "✅ 描述和歌词格式化成功",
     "format_failed": "❌ 格式化失败: {error}",
-    "skipping_metas_cot": "⚡ 跳过 Phase 1 元数据 COT(样本已格式化)"
+    "skipping_metas_cot": "⚡ 跳过 Phase 1 元数据 COT(样本已格式化)",
+    "invalid_timesteps_format": "⚠️ 时间步格式无效,使用默认调度。",
+    "timesteps_out_of_range": "⚠️ 时间步必须在 [0, 1] 范围内,使用默认调度。",
+    "timesteps_count_mismatch": "⚠️ 时间步数量 ({actual}) 与推理步数 ({expected}) 不匹配,将使用时间步数量。"
   }
 }
acestep/gradio_ui/interfaces/generation.py CHANGED
@@ -402,6 +402,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
     )
 
     # Advanced Settings
+    # Default UI settings use turbo mode (max 8 steps, hide CFG/ADG/shift)
+    # These will be updated after model initialization based on handler.is_turbo_model()
     with gr.Accordion(t("generation.advanced_settings"), open=False):
         with gr.Row():
             inference_steps = gr.Slider(
@@ -462,6 +464,14 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                 info=t("generation.infer_method_info"),
             )
 
+        with gr.Row():
+            custom_timesteps = gr.Textbox(
+                label=t("generation.custom_timesteps_label"),
+                placeholder="0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0",
+                value="",
+                info=t("generation.custom_timesteps_info"),
+            )
+
         with gr.Row():
             cfg_interval_start = gr.Slider(
                 minimum=0.0,
@@ -698,6 +708,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
         "cfg_interval_end": cfg_interval_end,
         "shift": shift,
         "infer_method": infer_method,
+        "custom_timesteps": custom_timesteps,
         "audio_format": audio_format,
         "output_alignment_preference": output_alignment_preference,
         "think_checkbox": think_checkbox,
acestep/handler.py CHANGED
@@ -108,6 +108,12 @@ class AceStepHandler:
         except ImportError:
             return False
 
+    def is_turbo_model(self) -> bool:
+        """Check if the currently loaded model is a turbo model"""
+        if self.config is None:
+            return False
+        return getattr(self.config, 'is_turbo', False)
+
     def initialize_service(
         self,
         project_root: str,
@@ -1786,6 +1792,7 @@ class AceStepHandler:
         shift: float = 1.0,
         audio_code_hints: Optional[Union[str, List[str]]] = None,
         infer_method: str = "ode",
+        timesteps: Optional[List[float]] = None,
     ) -> Dict[str, Any]:
 
         """
@@ -1949,6 +1956,9 @@ class AceStepHandler:
             "cfg_interval_end": cfg_interval_end,
             "shift": shift,
         }
+        # Add custom timesteps if provided (convert to tensor)
+        if timesteps is not None:
+            generate_kwargs["timesteps"] = torch.tensor(timesteps, dtype=torch.float32)
         logger.info("[service_generate] Generating audio...")
         with self._load_model_context("model"):
             # Prepare condition tensors first (for LRC timestamp generation)
@@ -2081,6 +2091,7 @@ class AceStepHandler:
         shift: float = 1.0,
         infer_method: str = "ode",
         use_tiled_decode: bool = True,
+        timesteps: Optional[List[float]] = None,
         progress=None
     ) -> Dict[str, Any]:
         """
@@ -2230,7 +2241,8 @@ class AceStepHandler:
             shift=shift,  # Pass shift parameter
             infer_method=infer_method,  # Pass infer method (ode or sde)
             audio_code_hints=audio_code_hints_batch,  # Pass audio code hints as list
-            return_intermediate=should_return_intermediate
+            return_intermediate=should_return_intermediate,
+            timesteps=timesteps,  # Pass custom timesteps if provided
         )
 
         logger.info("[generate_music] Model generation completed. Decoding latents...")
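Since the handler simply turns the parsed list into a float32 tensor, one way to produce a schedule string for the new custom-timesteps box is sketched below. The shift mapping used here, t -> shift*t / (1 + (shift - 1)*t), is the common flow-matching convention; whether it matches ACE-Step's internal scheduler exactly is an assumption, so treat the output as a starting point to tweak rather than a drop-in equivalent.

import torch

def make_timesteps_string(steps: int = 8, shift: float = 3.0) -> str:
    # Uniform boundaries from 1.0 down to 0.0, warped by the shift factor
    t = torch.linspace(1.0, 0.0, steps + 1)
    t = shift * t / (1 + (shift - 1) * t)
    return ",".join(f"{v:.3f}" for v in t.tolist())

print(make_timesteps_string(8, 3.0))
# -> "1.000,0.955,0.900,0.833,0.750,0.643,0.500,0.300,0.000"
# The commit's example schedule starts at 0.97 rather than 1.0, so you may
# prefer to start the grid slightly below 1.0.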
acestep/inference.py CHANGED
@@ -97,6 +97,9 @@ class GenerationParams:
     cfg_interval_end: float = 1.0
     shift: float = 1.0
     infer_method: str = "ode"  # "ode" or "sde" - diffusion inference method
+    # Custom timesteps (parsed from string like "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0")
+    # If provided, overrides inference_steps and shift
+    timesteps: Optional[List[float]] = None
 
     repainting_start: float = 0.0
     repainting_end: float = -1
@@ -534,6 +537,7 @@ def generate_music(
         cfg_interval_end=params.cfg_interval_end,
         shift=params.shift,
         infer_method=params.infer_method,
+        timesteps=params.timesteps,
         progress=progress,
     )
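Putting the pieces together, a minimal sketch of driving the new field programmatically; the fields shown are taken from this diff, and any GenerationParams fields not listed are assumed to keep their dataclass defaults:

from acestep.inference import GenerationParams

params = GenerationParams(
    inference_steps=8,  # kept consistent with the schedule below
    shift=1.0,
    infer_method="ode",
    # Explicit 8-step schedule (9 boundaries); per the UI hint this overrides
    # inference_steps and shift
    timesteps=[0.97, 0.76, 0.615, 0.5, 0.395, 0.28, 0.18, 0.085, 0.0],
)

generate_music(...) then forwards params.timesteps into the handler, where it becomes torch.tensor(timesteps, dtype=torch.float32) before the diffusion call.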