ChuxiJ committed
Commit
376c43e
·
1 Parent(s): 5ac3586

refactor handler
.gitignore CHANGED
@@ -213,3 +213,5 @@ tests/
 checkpoints/
 playground.ipynb
 .history/
+upload_checkpoints.sh
+checkpoints.7z
acestep/acestep_v15_pipeline.py CHANGED
@@ -15,20 +15,33 @@ from .dataset_handler import DatasetHandler
 from .gradio_ui import create_gradio_interface
 
 
-def create_demo():
+def create_demo(init_params=None):
     """
     Create Gradio demo interface
 
+    Args:
+        init_params: Dictionary containing initialization parameters and state.
+            If None, service will not be pre-initialized.
+            Keys: 'pre_initialized' (bool), 'checkpoint', 'config_path', 'device',
+            'init_llm', 'lm_model_path', 'backend', 'use_flash_attention',
+            'offload_to_cpu', 'offload_dit_to_cpu', 'init_status',
+            'dit_handler', 'llm_handler' (initialized handlers if pre-initialized)
+
     Returns:
         Gradio Blocks instance
     """
-    # Create independent handler instances
-    dit_handler = AceStepHandler()  # DiT handler
-    llm_handler = LLMHandler()  # LM handler
+    # Use pre-initialized handlers if available, otherwise create new ones
+    if init_params and init_params.get('pre_initialized') and 'dit_handler' in init_params:
+        dit_handler = init_params['dit_handler']
+        llm_handler = init_params['llm_handler']
+    else:
+        dit_handler = AceStepHandler()  # DiT handler
+        llm_handler = LLMHandler()  # LM handler
+
     dataset_handler = DatasetHandler()  # Dataset handler
 
-    # Create Gradio interface with all handlers
-    demo = create_gradio_interface(dit_handler, llm_handler, dataset_handler)
+    # Create Gradio interface with all handlers and initialization parameters
+    demo = create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=init_params)
 
     return demo
 
@@ -42,12 +55,124 @@ def main():
     parser.add_argument("--share", action="store_true", help="Create a public link")
     parser.add_argument("--debug", action="store_true", help="Enable debug mode")
     parser.add_argument("--server-name", type=str, default="127.0.0.1", help="Server name (default: 127.0.0.1, use 0.0.0.0 for all interfaces)")
+
+    # Service initialization arguments
+    parser.add_argument("--init_service", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Initialize service on startup (default: False)")
+    parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint file path (optional, for display purposes)")
+    parser.add_argument("--config_path", type=str, default=None, help="Main model path (e.g., 'acestep-v15-turbo')")
+    parser.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"], help="Processing device (default: auto)")
+    parser.add_argument("--init_llm", type=lambda x: x.lower() in ['true', '1', 'yes'], default=True, help="Initialize 5Hz LM (default: True)")
+    parser.add_argument("--lm_model_path", type=str, default=None, help="5Hz LM model path (e.g., 'acestep-5Hz-lm-0.6B')")
+    parser.add_argument("--backend", type=str, default="vllm", choices=["vllm", "pt"], help="5Hz LM backend (default: vllm)")
+    parser.add_argument("--use_flash_attention", type=lambda x: x.lower() in ['true', '1', 'yes'], default=None, help="Use flash attention (default: auto-detect)")
+    parser.add_argument("--offload_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Offload models to CPU (default: False)")
+    parser.add_argument("--offload_dit_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Offload DiT to CPU (default: False)")
+
     args = parser.parse_args()
 
     try:
+        init_params = None
+
+        # If init_service is True, perform initialization before creating UI
+        if args.init_service:
+            print("Initializing service from command line...")
+
+            # Create handler instances for initialization
+            dit_handler = AceStepHandler()
+            llm_handler = LLMHandler()
+
+            # Auto-select config_path if not provided
+            if args.config_path is None:
+                available_models = dit_handler.get_available_acestep_v15_models()
+                if available_models:
+                    args.config_path = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else available_models[0]
+                    print(f"Auto-selected config_path: {args.config_path}")
+                else:
+                    print("Error: No available models found. Please specify --config_path", file=sys.stderr)
+                    sys.exit(1)
+
+            # Get project root (same logic as in handler)
+            current_file = os.path.abspath(__file__)
+            project_root = os.path.dirname(os.path.dirname(current_file))
+
+            # Determine flash attention setting
+            use_flash_attention = args.use_flash_attention
+            if use_flash_attention is None:
+                use_flash_attention = dit_handler.is_flash_attention_available()
+
+            # Initialize DiT handler
+            print(f"Initializing DiT model: {args.config_path} on {args.device}...")
+            init_status, enable_generate = dit_handler.initialize_service(
+                project_root=project_root,
+                config_path=args.config_path,
+                device=args.device,
+                use_flash_attention=use_flash_attention,
+                compile_model=False,
+                offload_to_cpu=args.offload_to_cpu,
+                offload_dit_to_cpu=args.offload_dit_to_cpu
+            )
+
+            if not enable_generate:
+                print(f"Error initializing DiT model: {init_status}", file=sys.stderr)
+                sys.exit(1)
+
+            print(f"DiT model initialized successfully")
+
+            # Initialize LM handler if requested
+            lm_status = ""
+            if args.init_llm:
+                if args.lm_model_path is None:
+                    # Try to get default LM model
+                    available_lm_models = llm_handler.get_available_5hz_lm_models()
+                    if available_lm_models:
+                        args.lm_model_path = available_lm_models[0]
+                        print(f"Using default LM model: {args.lm_model_path}")
+                    else:
+                        print("Warning: No LM models available, skipping LM initialization", file=sys.stderr)
+                        args.init_llm = False
+
+                if args.init_llm and args.lm_model_path:
+                    checkpoint_dir = os.path.join(project_root, "checkpoints")
+                    print(f"Initializing 5Hz LM: {args.lm_model_path} on {args.device}...")
+                    lm_status, lm_success = llm_handler.initialize(
+                        checkpoint_dir=checkpoint_dir,
+                        lm_model_path=args.lm_model_path,
+                        backend=args.backend,
+                        device=args.device,
+                        offload_to_cpu=args.offload_to_cpu,
+                        dtype=dit_handler.dtype
+                    )
+
+                    if lm_success:
+                        print(f"5Hz LM initialized successfully")
+                        init_status += f"\n{lm_status}"
+                    else:
+                        print(f"Warning: 5Hz LM initialization failed: {lm_status}", file=sys.stderr)
+                        init_status += f"\n{lm_status}"
+
+            # Prepare initialization parameters for UI
+            init_params = {
+                'pre_initialized': True,
+                'checkpoint': args.checkpoint,
+                'config_path': args.config_path,
+                'device': args.device,
+                'init_llm': args.init_llm,
+                'lm_model_path': args.lm_model_path,
+                'backend': args.backend,
+                'use_flash_attention': use_flash_attention,
+                'offload_to_cpu': args.offload_to_cpu,
+                'offload_dit_to_cpu': args.offload_dit_to_cpu,
+                'init_status': init_status,
+                'enable_generate': enable_generate,
+                'dit_handler': dit_handler,
+                'llm_handler': llm_handler
+            }
+
+            print("Service initialization completed successfully!")
+
         # Create and launch demo
         print("Creating Gradio interface...")
-        demo = create_demo()
+        demo = create_demo(init_params=init_params)
         print(f"Launching server on {args.server_name}:{args.port}...")
         demo.launch(
             server_name=args.server_name,
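
The new service-initialization flags above parse booleans with a small lambda rather than argparse's store_true, so values such as "--init_service true" or "--init_service 0" work from the command line. A minimal standalone sketch of that convention follows; the flag names mirror the diff, while the parsed values are illustrative and not project defaults.

# Hedged sketch: how the lambda-based boolean flags above behave. Any of
# "true" / "1" / "yes" (case-insensitive) parses to True; everything else to False.
import argparse

str2bool = lambda x: x.lower() in ['true', '1', 'yes']

parser = argparse.ArgumentParser()
parser.add_argument("--init_service", type=str2bool, default=False)
parser.add_argument("--init_llm", type=str2bool, default=True)

args = parser.parse_args(["--init_service", "True", "--init_llm", "no"])
print(args.init_service)  # True  -> "true" is in the accepted set
print(args.init_llm)      # False -> "no" is not in the accepted set
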
acestep/gradio_ui.py CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
7
  from typing import Callable, Optional
8
 
9
 
10
- def create_gradio_interface(dit_handler, llm_handler, dataset_handler) -> gr.Blocks:
11
  """
12
  Create Gradio interface
13
 
@@ -15,6 +15,8 @@ def create_gradio_interface(dit_handler, llm_handler, dataset_handler) -> gr.Blo
15
  dit_handler: DiT handler instance
16
  llm_handler: LM handler instance
17
  dataset_handler: Dataset handler instance
 
 
18
 
19
  Returns:
20
  Gradio Blocks instance
@@ -47,8 +49,8 @@ def create_gradio_interface(dit_handler, llm_handler, dataset_handler) -> gr.Blo
47
  # Dataset Explorer Section
48
  dataset_section = create_dataset_section(dataset_handler)
49
 
50
- # Generation Section
51
- generation_section = create_generation_section(dit_handler, llm_handler)
52
 
53
  # Results Section
54
  results_section = create_results_section(dit_handler)
@@ -156,20 +158,33 @@ def create_dataset_section(dataset_handler) -> dict:
156
  }
157
 
158
 
159
- def create_generation_section(dit_handler, llm_handler) -> dict:
160
- """Create generation section"""
161
  with gr.Group():
162
  gr.HTML('<div class="section-header"><h3>🎼 ACE-Step V1.5 Demo </h3></div>')
163
 
164
- # Service Configuration
165
- with gr.Accordion("🔧 Service Configuration", open=True) as service_config_accordion:
 
166
  # Dropdown options section - all dropdowns grouped together
167
  with gr.Row(equal_height=True):
168
  with gr.Column(scale=4):
 
 
169
  checkpoint_dropdown = gr.Dropdown(
170
  label="Checkpoint File",
171
  choices=dit_handler.get_available_checkpoints(),
172
- value=None,
173
  info="Select a trained model checkpoint file (full path or filename)"
174
  )
175
  with gr.Column(scale=1, min_width=90):
@@ -180,15 +195,19 @@ def create_generation_section(dit_handler, llm_handler) -> dict:
180
  available_models = dit_handler.get_available_acestep_v15_models()
181
  default_model = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else (available_models[0] if available_models else None)
182
 
 
 
183
  config_path = gr.Dropdown(
184
  label="Main Model Path",
185
  choices=available_models,
186
- value=default_model,
187
  info="Select the model configuration directory (auto-scanned from checkpoints)"
188
  )
 
 
189
  device = gr.Dropdown(
190
  choices=["auto", "cuda", "cpu"],
191
- value="auto",
192
  label="Device",
193
  info="Processing device (auto-detect recommended)"
194
  )
@@ -198,47 +217,61 @@ def create_generation_section(dit_handler, llm_handler) -> dict:
198
  available_lm_models = llm_handler.get_available_5hz_lm_models()
199
  default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None)
200
 
 
 
201
  lm_model_path = gr.Dropdown(
202
  label="5Hz LM Model Path",
203
  choices=available_lm_models,
204
- value=default_lm_model,
205
  info="Select the 5Hz LM model checkpoint (auto-scanned from checkpoints)"
206
  )
 
 
207
  backend_dropdown = gr.Dropdown(
208
  choices=["vllm", "pt"],
209
- value="vllm",
210
  label="5Hz LM Backend",
211
  info="Select backend for 5Hz LM: vllm (faster) or pt (PyTorch, more compatible)"
212
  )
213
 
214
  # Checkbox options section - all checkboxes grouped together
215
  with gr.Row():
 
 
216
  init_llm_checkbox = gr.Checkbox(
217
  label="Initialize 5Hz LM",
218
- value=False,
219
  info="Check to initialize 5Hz LM during service initialization",
220
  )
221
  # Auto-detect flash attention availability
222
  flash_attn_available = dit_handler.is_flash_attention_available()
 
 
223
  use_flash_attention_checkbox = gr.Checkbox(
224
  label="Use Flash Attention",
225
- value=flash_attn_available,
226
  interactive=flash_attn_available,
227
  info="Enable flash attention for faster inference (requires flash_attn package)" if flash_attn_available else "Flash attention not available (flash_attn package not installed)"
228
  )
 
 
229
  offload_to_cpu_checkbox = gr.Checkbox(
230
  label="Offload to CPU",
231
- value=False,
232
  info="Offload models to CPU when not in use to save GPU memory"
233
  )
 
 
234
  offload_dit_to_cpu_checkbox = gr.Checkbox(
235
  label="Offload DiT to CPU",
236
- value=False,
237
  info="Offload DiT to CPU (needs Offload to CPU)"
238
  )
239
 
240
  init_btn = gr.Button("Initialize Service", variant="primary", size="lg")
241
- init_status = gr.Textbox(label="Status", interactive=False, lines=3)
 
 
242
 
243
  # Inputs
244
  with gr.Row():
@@ -328,7 +361,7 @@ def create_generation_section(dit_handler, llm_handler) -> dict:
328
  label="Temperature",
329
  minimum=0.0,
330
  maximum=2.0,
331
- value=0.7,
332
  step=0.1,
333
  scale=1,
334
  info="Temperature for 5Hz LM sampling (higher = more random, lower = more deterministic)"
@@ -337,18 +370,48 @@ def create_generation_section(dit_handler, llm_handler) -> dict:
337
  label="CFG Scale",
338
  minimum=1.0,
339
  maximum=3.0,
340
- value=1.0,
341
  step=0.1,
342
  scale=1,
343
  info="Classifier-Free Guidance scale for 5Hz LM (1.0 = no CFG, higher = stronger guidance)"
344
  )
345
 
346
  # Negative prompt for CFG (only visible when LM initialized and cfg_scale > 1)
347
  lm_negative_prompt = gr.Textbox(
348
  label="Negative Prompt",
349
  value="NO USER INPUT",
350
  placeholder="Enter negative prompt for CFG (default: NO USER INPUT)",
351
- visible=False,
352
  info="Negative prompt used for Classifier-Free Guidance when CFG Scale > 1.0",
353
  lines=2
354
  )
@@ -377,7 +440,7 @@ def create_generation_section(dit_handler, llm_handler) -> dict:
377
  step=0.01,
378
  label="Audio Cover Strength",
379
  info="Control how many denoising steps use cover mode",
380
- visible=False
381
  )
382
 
383
  # Music Caption
@@ -514,7 +577,9 @@ def create_generation_section(dit_handler, llm_handler) -> dict:
514
  interactive=False
515
  )
516
 
517
- generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg", interactive=False)
 
 
518
 
519
  return {
520
  "checkpoint_dropdown": checkpoint_dropdown,
@@ -542,6 +607,9 @@ def create_generation_section(dit_handler, llm_handler) -> dict:
542
  "use_5hz_lm_btn": use_5hz_lm_btn,
543
  "lm_temperature": lm_temperature,
544
  "lm_cfg_scale": lm_cfg_scale,
 
 
 
545
  "lm_negative_prompt": lm_negative_prompt,
546
  "repainting_group": repainting_group,
547
  "repainting_start": repainting_start,
@@ -733,6 +801,47 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
733
 
734
  return status, gr.update(interactive=enable)
735
 
736
  generation_section["init_btn"].click(
737
  fn=init_service_wrapper,
738
  inputs=[
@@ -749,30 +858,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
749
  outputs=[generation_section["init_status"], generation_section["generate_btn"]]
750
  )
751
 
752
- # Update negative prompt visibility based on LM initialization and CFG scale
753
- def update_negative_prompt_visibility(init_status, cfg_scale):
754
- """Update negative prompt visibility: show only if LM initialized and cfg_scale > 1"""
755
- # Check if LM is initialized by looking for "5Hz LM backend:" in status
756
- lm_initialized = init_status is not None and "5Hz LM backend:" in str(init_status)
757
- # Check if cfg_scale > 1
758
- cfg_enabled = cfg_scale is not None and float(cfg_scale) > 1.0
759
- # Show only if both conditions are met
760
- return gr.update(visible=lm_initialized and cfg_enabled)
761
-
762
- # Update visibility when init_status changes
763
- generation_section["init_status"].change(
764
- fn=update_negative_prompt_visibility,
765
- inputs=[generation_section["init_status"], generation_section["lm_cfg_scale"]],
766
- outputs=[generation_section["lm_negative_prompt"]]
767
- )
768
-
769
- # Update visibility when cfg_scale changes
770
- generation_section["lm_cfg_scale"].change(
771
- fn=update_negative_prompt_visibility,
772
- inputs=[generation_section["init_status"], generation_section["lm_cfg_scale"]],
773
- outputs=[generation_section["lm_negative_prompt"]]
774
- )
775
-
776
  # Generation with progress bar
777
  def generate_with_progress(
778
  captions, lyrics, bpm, key_scale, time_signature, vocal_language,
@@ -845,9 +930,16 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
845
  )
846
 
847
  # 5Hz LM generation (simplified version, can be extended as needed)
848
- def generate_lm_hints_wrapper(caption, lyrics, temperature, cfg_scale, negative_prompt):
849
  """Wrapper for 5Hz LM generation"""
850
- metadata, audio_codes, status = llm_handler.generate_with_5hz_lm(caption, lyrics, temperature, cfg_scale, negative_prompt)
851
 
852
  # Extract metadata values and map to UI fields
853
  # Handle bpm
@@ -886,6 +978,9 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
886
  generation_section["lyrics"],
887
  generation_section["lm_temperature"],
888
  generation_section["lm_cfg_scale"],
 
 
 
889
  generation_section["lm_negative_prompt"]
890
  ],
891
  outputs=[
@@ -902,7 +997,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
902
  task_type_value: str,
903
  track_name_value: Optional[str],
904
  complete_track_classes_value: list,
905
- audio_codes_content: str = ""
 
906
  ) -> tuple:
907
  """Update instruction and UI visibility based on task type."""
908
  instruction = dit_handler.generate_instruction(
@@ -915,8 +1011,15 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
915
  track_name_visible = task_type_value in ["lego", "extract"]
916
  # Show complete_track_classes for complete
917
  complete_visible = task_type_value == "complete"
918
- # Show audio_cover_strength for cover
919
- audio_cover_strength_visible = task_type_value == "cover"
  # Show audio_code_string for cover
921
  audio_code_visible = task_type_value == "cover"
922
  # Show repainting controls for repaint and lego
@@ -932,7 +1035,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
932
  instruction, # instruction_display_gen
933
  gr.update(visible=track_name_visible), # track_name
934
  gr.update(visible=complete_visible), # complete_track_classes
935
- gr.update(visible=audio_cover_strength_visible), # audio_cover_strength
936
  gr.update(visible=repainting_visible), # repainting_group
937
  gr.update(visible=audio_code_visible), # audio_code_string
938
  gr.update(visible=use_5hz_lm_visible), # use_5hz_lm_row
@@ -946,7 +1049,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
946
  generation_section["task_type"],
947
  generation_section["track_name"],
948
  generation_section["complete_track_classes"],
949
- generation_section["text2music_audio_code_string"]
 
950
  ],
951
  outputs=[
952
  generation_section["instruction_display_gen"],
@@ -967,7 +1071,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
967
  generation_section["task_type"],
968
  generation_section["track_name"],
969
  generation_section["complete_track_classes"],
970
- generation_section["text2music_audio_code_string"]
 
971
  ],
972
  outputs=[
973
  generation_section["instruction_display_gen"],
@@ -988,7 +1093,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
988
  generation_section["task_type"],
989
  generation_section["track_name"],
990
  generation_section["complete_track_classes"],
991
- generation_section["text2music_audio_code_string"]
 
992
  ],
993
  outputs=[
994
  generation_section["instruction_display_gen"],
 
7
  from typing import Callable, Optional
8
 
9
 
10
+ def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=None) -> gr.Blocks:
11
  """
12
  Create Gradio interface
13
 
 
15
  dit_handler: DiT handler instance
16
  llm_handler: LM handler instance
17
  dataset_handler: Dataset handler instance
18
+ init_params: Dictionary containing initialization parameters and state.
19
+ If None, service will not be pre-initialized.
20
 
21
  Returns:
22
  Gradio Blocks instance
 
49
  # Dataset Explorer Section
50
  dataset_section = create_dataset_section(dataset_handler)
51
 
52
+ # Generation Section (pass init_params to support pre-initialization)
53
+ generation_section = create_generation_section(dit_handler, llm_handler, init_params=init_params)
54
 
55
  # Results Section
56
  results_section = create_results_section(dit_handler)
 
158
  }
159
 
160
 
161
+ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dict:
162
+ """Create generation section
163
+
164
+ Args:
165
+ dit_handler: DiT handler instance
166
+ llm_handler: LM handler instance
167
+ init_params: Dictionary containing initialization parameters and state.
168
+ If None, service will not be pre-initialized.
169
+ """
170
+ # Check if service is pre-initialized
171
+ service_pre_initialized = init_params is not None and init_params.get('pre_initialized', False)
172
+
173
  with gr.Group():
174
  gr.HTML('<div class="section-header"><h3>🎼 ACE-Step V1.5 Demo </h3></div>')
175
 
176
+ # Service Configuration - collapse if pre-initialized
177
+ accordion_open = not service_pre_initialized
178
+ with gr.Accordion("🔧 Service Configuration", open=accordion_open) as service_config_accordion:
179
  # Dropdown options section - all dropdowns grouped together
180
  with gr.Row(equal_height=True):
181
  with gr.Column(scale=4):
182
+ # Set checkpoint value from init_params if pre-initialized
183
+ checkpoint_value = init_params.get('checkpoint') if service_pre_initialized else None
184
  checkpoint_dropdown = gr.Dropdown(
185
  label="Checkpoint File",
186
  choices=dit_handler.get_available_checkpoints(),
187
+ value=checkpoint_value,
188
  info="Select a trained model checkpoint file (full path or filename)"
189
  )
190
  with gr.Column(scale=1, min_width=90):
 
195
  available_models = dit_handler.get_available_acestep_v15_models()
196
  default_model = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else (available_models[0] if available_models else None)
197
 
198
+ # Set config_path value from init_params if pre-initialized
199
+ config_path_value = init_params.get('config_path', default_model) if service_pre_initialized else default_model
200
  config_path = gr.Dropdown(
201
  label="Main Model Path",
202
  choices=available_models,
203
+ value=config_path_value,
204
  info="Select the model configuration directory (auto-scanned from checkpoints)"
205
  )
206
+ # Set device value from init_params if pre-initialized
207
+ device_value = init_params.get('device', 'auto') if service_pre_initialized else 'auto'
208
  device = gr.Dropdown(
209
  choices=["auto", "cuda", "cpu"],
210
+ value=device_value,
211
  label="Device",
212
  info="Processing device (auto-detect recommended)"
213
  )
 
217
  available_lm_models = llm_handler.get_available_5hz_lm_models()
218
  default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None)
219
 
220
+ # Set lm_model_path value from init_params if pre-initialized
221
+ lm_model_path_value = init_params.get('lm_model_path', default_lm_model) if service_pre_initialized else default_lm_model
222
  lm_model_path = gr.Dropdown(
223
  label="5Hz LM Model Path",
224
  choices=available_lm_models,
225
+ value=lm_model_path_value,
226
  info="Select the 5Hz LM model checkpoint (auto-scanned from checkpoints)"
227
  )
228
+ # Set backend value from init_params if pre-initialized
229
+ backend_value = init_params.get('backend', 'vllm') if service_pre_initialized else 'vllm'
230
  backend_dropdown = gr.Dropdown(
231
  choices=["vllm", "pt"],
232
+ value=backend_value,
233
  label="5Hz LM Backend",
234
  info="Select backend for 5Hz LM: vllm (faster) or pt (PyTorch, more compatible)"
235
  )
236
 
237
  # Checkbox options section - all checkboxes grouped together
238
  with gr.Row():
239
+ # Set init_llm value from init_params if pre-initialized
240
+ init_llm_value = init_params.get('init_llm', True) if service_pre_initialized else True
241
  init_llm_checkbox = gr.Checkbox(
242
  label="Initialize 5Hz LM",
243
+ value=init_llm_value,
244
  info="Check to initialize 5Hz LM during service initialization",
245
  )
246
  # Auto-detect flash attention availability
247
  flash_attn_available = dit_handler.is_flash_attention_available()
248
+ # Set use_flash_attention value from init_params if pre-initialized
249
+ use_flash_attention_value = init_params.get('use_flash_attention', flash_attn_available) if service_pre_initialized else flash_attn_available
250
  use_flash_attention_checkbox = gr.Checkbox(
251
  label="Use Flash Attention",
252
+ value=use_flash_attention_value,
253
  interactive=flash_attn_available,
254
  info="Enable flash attention for faster inference (requires flash_attn package)" if flash_attn_available else "Flash attention not available (flash_attn package not installed)"
255
  )
256
+ # Set offload_to_cpu value from init_params if pre-initialized
257
+ offload_to_cpu_value = init_params.get('offload_to_cpu', False) if service_pre_initialized else False
258
  offload_to_cpu_checkbox = gr.Checkbox(
259
  label="Offload to CPU",
260
+ value=offload_to_cpu_value,
261
  info="Offload models to CPU when not in use to save GPU memory"
262
  )
263
+ # Set offload_dit_to_cpu value from init_params if pre-initialized
264
+ offload_dit_to_cpu_value = init_params.get('offload_dit_to_cpu', False) if service_pre_initialized else False
265
  offload_dit_to_cpu_checkbox = gr.Checkbox(
266
  label="Offload DiT to CPU",
267
+ value=offload_dit_to_cpu_value,
268
  info="Offload DiT to CPU (needs Offload to CPU)"
269
  )
270
 
271
  init_btn = gr.Button("Initialize Service", variant="primary", size="lg")
272
+ # Set init_status value from init_params if pre-initialized
273
+ init_status_value = init_params.get('init_status', '') if service_pre_initialized else ''
274
+ init_status = gr.Textbox(label="Status", interactive=False, lines=3, value=init_status_value)
275
 
276
  # Inputs
277
  with gr.Row():
 
361
  label="Temperature",
362
  minimum=0.0,
363
  maximum=2.0,
364
+ value=0.85,
365
  step=0.1,
366
  scale=1,
367
  info="Temperature for 5Hz LM sampling (higher = more random, lower = more deterministic)"
 
370
  label="CFG Scale",
371
  minimum=1.0,
372
  maximum=3.0,
373
+ value=2.0,
374
  step=0.1,
375
  scale=1,
376
  info="Classifier-Free Guidance scale for 5Hz LM (1.0 = no CFG, higher = stronger guidance)"
377
  )
378
 
379
+ with gr.Row():
380
+ lm_top_k = gr.Slider(
381
+ label="Top-K",
382
+ minimum=0,
383
+ maximum=100,
384
+ value=0,
385
+ step=1,
386
+ scale=1,
387
+ info="Top-K sampling: consider only top K tokens (0 = disabled)"
388
+ )
389
+ lm_top_p = gr.Slider(
390
+ label="Top-P",
391
+ minimum=0.0,
392
+ maximum=1.0,
393
+ value=0.9,
394
+ step=0.01,
395
+ scale=1,
396
+ info="Top-P (nucleus) sampling: cumulative probability threshold (1.0 = disabled)"
397
+ )
398
+ lm_repetition_penalty = gr.Slider(
399
+ label="Repetition Penalty",
400
+ minimum=0.8,
401
+ maximum=1.2,
402
+ value=1.0,
403
+ step=0.01,
404
+ scale=1,
405
+ info="Repetition penalty: >1.0 reduces repetition, <1.0 increases it (1.0 = no penalty). For audio generation, use 1.0 or very small values (1.01-1.05) as audio tokens naturally repeat.",
406
+ visible=False,
407
+ )
408
+
409
  # Negative prompt for CFG (only visible when LM initialized and cfg_scale > 1)
410
  lm_negative_prompt = gr.Textbox(
411
  label="Negative Prompt",
412
  value="NO USER INPUT",
413
  placeholder="Enter negative prompt for CFG (default: NO USER INPUT)",
414
+ visible=True,
415
  info="Negative prompt used for Classifier-Free Guidance when CFG Scale > 1.0",
416
  lines=2
417
  )
 
440
  step=0.01,
441
  label="Audio Cover Strength",
442
  info="Control how many denoising steps use cover mode",
443
+ visible=True
444
  )
445
 
446
  # Music Caption
 
577
  interactive=False
578
  )
579
 
580
+ # Set generate_btn to interactive if service is pre-initialized
581
+ generate_btn_interactive = init_params.get('enable_generate', False) if service_pre_initialized else False
582
+ generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg", interactive=generate_btn_interactive)
583
 
584
  return {
585
  "checkpoint_dropdown": checkpoint_dropdown,
 
607
  "use_5hz_lm_btn": use_5hz_lm_btn,
608
  "lm_temperature": lm_temperature,
609
  "lm_cfg_scale": lm_cfg_scale,
610
+ "lm_top_k": lm_top_k,
611
+ "lm_top_p": lm_top_p,
612
+ "lm_repetition_penalty": lm_repetition_penalty,
613
  "lm_negative_prompt": lm_negative_prompt,
614
  "repainting_group": repainting_group,
615
  "repainting_start": repainting_start,
 
801
 
802
  return status, gr.update(interactive=enable)
803
 
804
+ # Update negative prompt visibility based on "Initialize 5Hz LM" checkbox
805
+ def update_negative_prompt_visibility(init_llm_checked):
806
+ """Update negative prompt visibility: show if Initialize 5Hz LM checkbox is checked"""
807
+ return gr.update(visible=init_llm_checked)
808
+
809
+ # Update audio_cover_strength visibility and label based on task type and LM initialization
810
+ def update_audio_cover_strength_visibility(task_type_value, init_llm_checked):
811
+ """Update audio_cover_strength visibility and label"""
812
+ # Show if task is cover OR if LM is initialized
813
+ is_visible = (task_type_value == "cover") or init_llm_checked
814
+ # Change label based on context
815
+ if init_llm_checked and task_type_value != "cover":
816
+ label = "LM codes strength"
817
+ info = "Control how many denoising steps use LM-generated codes"
818
+ else:
819
+ label = "Audio Cover Strength"
820
+ info = "Control how many denoising steps use cover mode"
821
+
822
+ return gr.update(visible=is_visible, label=label, info=info)
823
+
824
+ # Update visibility when init_llm_checkbox changes
825
+ generation_section["init_llm_checkbox"].change(
826
+ fn=update_negative_prompt_visibility,
827
+ inputs=[generation_section["init_llm_checkbox"]],
828
+ outputs=[generation_section["lm_negative_prompt"]]
829
+ )
830
+
831
+ # Update audio_cover_strength visibility and label when init_llm_checkbox changes
832
+ generation_section["init_llm_checkbox"].change(
833
+ fn=update_audio_cover_strength_visibility,
834
+ inputs=[generation_section["task_type"], generation_section["init_llm_checkbox"]],
835
+ outputs=[generation_section["audio_cover_strength"]]
836
+ )
837
+
838
+ # Also update audio_cover_strength when task_type changes (to handle label changes)
839
+ generation_section["task_type"].change(
840
+ fn=update_audio_cover_strength_visibility,
841
+ inputs=[generation_section["task_type"], generation_section["init_llm_checkbox"]],
842
+ outputs=[generation_section["audio_cover_strength"]]
843
+ )
844
+
845
  generation_section["init_btn"].click(
846
  fn=init_service_wrapper,
847
  inputs=[
 
858
  outputs=[generation_section["init_status"], generation_section["generate_btn"]]
859
  )
860
 
861
  # Generation with progress bar
862
  def generate_with_progress(
863
  captions, lyrics, bpm, key_scale, time_signature, vocal_language,
 
930
  )
931
 
932
  # 5Hz LM generation (simplified version, can be extended as needed)
933
+ def generate_lm_hints_wrapper(caption, lyrics, temperature, cfg_scale, top_k, top_p, repetition_penalty, negative_prompt):
934
  """Wrapper for 5Hz LM generation"""
935
+ # Convert top_k: 0 means None (disabled)
936
+ top_k_value = None if top_k == 0 else int(top_k)
937
+ # Convert top_p: 1.0 means None (disabled)
938
+ top_p_value = None if top_p >= 1.0 else top_p
939
+ metadata, audio_codes, status = llm_handler.generate_with_5hz_lm(
940
+ caption, lyrics, temperature, cfg_scale, negative_prompt,
941
+ top_k_value, top_p_value, repetition_penalty
942
+ )
943
 
944
  # Extract metadata values and map to UI fields
945
  # Handle bpm
 
978
  generation_section["lyrics"],
979
  generation_section["lm_temperature"],
980
  generation_section["lm_cfg_scale"],
981
+ generation_section["lm_top_k"],
982
+ generation_section["lm_top_p"],
983
+ generation_section["lm_repetition_penalty"],
984
  generation_section["lm_negative_prompt"]
985
  ],
986
  outputs=[
 
997
  task_type_value: str,
998
  track_name_value: Optional[str],
999
  complete_track_classes_value: list,
1000
+ audio_codes_content: str = "",
1001
+ init_llm_checked: bool = False
1002
  ) -> tuple:
1003
  """Update instruction and UI visibility based on task type."""
1004
  instruction = dit_handler.generate_instruction(
 
1011
  track_name_visible = task_type_value in ["lego", "extract"]
1012
  # Show complete_track_classes for complete
1013
  complete_visible = task_type_value == "complete"
1014
+ # Show audio_cover_strength for cover OR when LM is initialized
1015
+ audio_cover_strength_visible = (task_type_value == "cover") or init_llm_checked
1016
+ # Determine label and info based on context
1017
+ if init_llm_checked and task_type_value != "cover":
1018
+ audio_cover_strength_label = "LM codes strength"
1019
+ audio_cover_strength_info = "Control how many denoising steps use LM-generated codes"
1020
+ else:
1021
+ audio_cover_strength_label = "Audio Cover Strength"
1022
+ audio_cover_strength_info = "Control how many denoising steps use cover mode"
1023
  # Show audio_code_string for cover
1024
  audio_code_visible = task_type_value == "cover"
1025
  # Show repainting controls for repaint and lego
 
1035
  instruction, # instruction_display_gen
1036
  gr.update(visible=track_name_visible), # track_name
1037
  gr.update(visible=complete_visible), # complete_track_classes
1038
+ gr.update(visible=audio_cover_strength_visible, label=audio_cover_strength_label, info=audio_cover_strength_info), # audio_cover_strength
1039
  gr.update(visible=repainting_visible), # repainting_group
1040
  gr.update(visible=audio_code_visible), # audio_code_string
1041
  gr.update(visible=use_5hz_lm_visible), # use_5hz_lm_row
 
1049
  generation_section["task_type"],
1050
  generation_section["track_name"],
1051
  generation_section["complete_track_classes"],
1052
+ generation_section["text2music_audio_code_string"],
1053
+ generation_section["init_llm_checkbox"]
1054
  ],
1055
  outputs=[
1056
  generation_section["instruction_display_gen"],
 
1071
  generation_section["task_type"],
1072
  generation_section["track_name"],
1073
  generation_section["complete_track_classes"],
1074
+ generation_section["text2music_audio_code_string"],
1075
+ generation_section["init_llm_checkbox"]
1076
  ],
1077
  outputs=[
1078
  generation_section["instruction_display_gen"],
 
1093
  generation_section["task_type"],
1094
  generation_section["track_name"],
1095
  generation_section["complete_track_classes"],
1096
+ generation_section["text2music_audio_code_string"],
1097
+ generation_section["init_llm_checkbox"]
1098
  ],
1099
  outputs=[
1100
  generation_section["instruction_display_gen"],
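
The event wiring above leans on a single Gradio pattern: a change event returns gr.update(...) objects that toggle the visibility, label, and info text of other components. Below is a minimal standalone sketch of that pattern, assuming a recent Gradio version; the component names are illustrative and not the actual ACE-Step UI components.

# Hedged sketch of the gr.update() visibility pattern used above.
import gradio as gr

def toggle_negative_prompt(checked):
    # Show the textbox only when the checkbox is ticked
    return gr.update(visible=checked)

with gr.Blocks() as demo:
    init_llm = gr.Checkbox(label="Initialize 5Hz LM", value=False)
    negative_prompt = gr.Textbox(label="Negative Prompt", visible=False)
    init_llm.change(fn=toggle_negative_prompt, inputs=[init_llm], outputs=[negative_prompt])

if __name__ == "__main__":
    demo.launch()
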
acestep/handler.py CHANGED
@@ -1362,7 +1362,7 @@ class AceStepHandler:
1362
 
1363
  padded_non_cover_text_input_ids = None
1364
  padded_non_cover_text_attention_masks = None
1365
- if audio_cover_strength < 1.0 and is_covers is not None and is_covers.any():
1366
  non_cover_text_input_ids = []
1367
  non_cover_text_attention_masks = []
1368
  for i in range(batch_size):
@@ -1381,8 +1381,9 @@ class AceStepHandler:
1381
  return_tensors="pt",
1382
  )
1383
  text_token_ids = text_inputs_dict.input_ids[0]
 
1384
  non_cover_text_input_ids.append(text_token_ids)
1385
- non_cover_text_attention_masks.append(text_attention_mask)
1386
 
1387
  padded_non_cover_text_input_ids = torch.stack([
1388
  torch.nn.functional.pad(
 
1362
 
1363
  padded_non_cover_text_input_ids = None
1364
  padded_non_cover_text_attention_masks = None
1365
+ if audio_cover_strength < 1.0:
1366
  non_cover_text_input_ids = []
1367
  non_cover_text_attention_masks = []
1368
  for i in range(batch_size):
 
1381
  return_tensors="pt",
1382
  )
1383
  text_token_ids = text_inputs_dict.input_ids[0]
1384
+ non_cover_text_attention_mask = text_inputs_dict.attention_mask[0].bool()
1385
  non_cover_text_input_ids.append(text_token_ids)
1386
+ non_cover_text_attention_masks.append(non_cover_text_attention_mask)
1387
 
1388
  padded_non_cover_text_input_ids = torch.stack([
1389
  torch.nn.functional.pad(
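
The handler fix above keeps the per-sample attention mask returned by the tokenizer and pads it alongside the input ids before stacking into a batch. A toy sketch of that pad-and-stack pattern follows; the tensors and lengths are made up, not the handler's actual data.

# Hedged toy example of right-padding variable-length sequences and masks
# to a common length before torch.stack.
import torch
import torch.nn.functional as F

token_id_seqs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
attention_masks = [torch.ones(len(s), dtype=torch.long) for s in token_id_seqs]
max_len = max(len(s) for s in token_id_seqs)

padded_ids = torch.stack([
    F.pad(s, (0, max_len - len(s)), value=0) for s in token_id_seqs
])
padded_masks = torch.stack([
    F.pad(m, (0, max_len - len(m)), value=0) for m in attention_masks
]).bool()

print(padded_ids)    # tensor([[5, 6, 7], [8, 9, 0]])
print(padded_masks)  # tensor([[ True,  True,  True], [ True,  True, False]])
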
acestep/llm_inference.py CHANGED
@@ -11,8 +11,18 @@ from contextlib import contextmanager
11
  import torch
12
  from tqdm import tqdm
13
  from loguru import logger
14
- from transformers import AutoTokenizer, AutoModelForCausalLM
15
  from transformers.generation.streamers import BaseStreamer
16
 
17
 
18
  class LLMHandler:
@@ -209,7 +219,17 @@ class LLMHandler:
209
  error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
210
  return error_msg
211
 
212
- def generate_with_5hz_lm_vllm(self, caption: str, lyrics: str, temperature: float = 0.6, cfg_scale: float = 1.0, negative_prompt: str = "NO USER INPUT") -> Tuple[Dict[str, Any], str, str]:
213
  """Generate metadata and audio codes using 5Hz LM with vllm backend"""
214
  try:
215
  from nanovllm import SamplingParams
@@ -226,7 +246,14 @@ class LLMHandler:
226
  )
227
  logger.debug(f"[debug] formatted_prompt: {formatted_prompt}")
228
 
229
- sampling_params = SamplingParams(max_tokens=self.max_model_len-64, temperature=temperature, cfg_scale=cfg_scale)
230
  # Use CFG if cfg_scale > 1.0
231
  if cfg_scale > 1.0:
232
  # Build unconditional prompt (user input replaced with "NO USER INPUT")
@@ -266,7 +293,17 @@ class LLMHandler:
266
  error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
267
  return {}, "", error_msg
268
 
269
- def generate_with_5hz_lm_pt(self, caption: str, lyrics: str, temperature: float = 0.6) -> Tuple[Dict[str, Any], str, str]:
270
  """Generate metadata and audio codes using 5Hz LM with PyTorch backend"""
271
  try:
272
  prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
@@ -295,7 +332,7 @@ class LLMHandler:
295
  # Get max_new_tokens from model config or use a default
296
  max_new_tokens = getattr(self.llm.config, 'max_new_tokens', 4096)
297
  if hasattr(self, 'max_model_len'):
298
- max_new_tokens = min(max_new_tokens, self.max_model_len)
299
 
300
  # Define custom streamer for tqdm
301
  class TqdmTokenStreamer(BaseStreamer):
@@ -315,15 +352,78 @@ class LLMHandler:
315
 
316
  streamer = TqdmTokenStreamer(total=max_new_tokens)
317
 
318
- with torch.no_grad():
319
- outputs = self.llm.generate(
320
- **inputs,
321
  max_new_tokens=max_new_tokens,
322
  temperature=temperature,
323
- do_sample=True if temperature > 0 else False,
 
324
  pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
325
  streamer=streamer,
326
  )
 
327
 
328
  # Decode the generated tokens
329
  # Only decode the newly generated tokens (skip the input prompt)
@@ -338,7 +438,17 @@ class LLMHandler:
338
  error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
339
  return {}, "", error_msg
340
 
341
- def generate_with_5hz_lm(self, caption: str, lyrics: str, temperature: float = 0.6, cfg_scale: float = 1.0, negative_prompt: str = "NO USER INPUT") -> Tuple[Dict[str, Any], str, str]:
342
  """Generate metadata and audio codes using 5Hz LM"""
343
  # Check if 5Hz LM is initialized
344
  if not hasattr(self, 'llm_initialized') or not self.llm_initialized:
@@ -355,9 +465,15 @@ class LLMHandler:
355
  return {}, "", "❌ 5Hz LM backend not set. Please initialize it first."
356
 
357
  if self.llm_backend == "vllm":
358
- return self.generate_with_5hz_lm_vllm(caption, lyrics, temperature, cfg_scale, negative_prompt)
 
 
 
359
  else:
360
- return self.generate_with_5hz_lm_pt(caption, lyrics, temperature)
 
 
 
361
 
362
  def parse_lm_output(self, output_text: str) -> Tuple[Dict[str, Any], str]:
363
  """
@@ -440,6 +556,112 @@ class LLMHandler:
440
 
441
  return metadata, audio_codes
442
 
443
  @contextmanager
444
  def _load_model_context(self):
445
  """
 
11
  import torch
12
  from tqdm import tqdm
13
  from loguru import logger
14
+ from transformers import AutoTokenizer, AutoModelForCausalLM, ClassifierFreeGuidanceLogitsProcessor
15
  from transformers.generation.streamers import BaseStreamer
16
+ from transformers.generation.logits_process import (
17
+ LogitsProcessorList,
18
+ LogitsProcessor,
19
+ TopKLogitsWarper,
20
+ TopPLogitsWarper,
21
+ RepetitionPenaltyLogitsProcessor,
22
+ TemperatureLogitsWarper,
23
+ )
24
+
25
+
26
 
27
 
28
  class LLMHandler:
 
219
  error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
220
  return error_msg
221
 
222
+ def generate_with_5hz_lm_vllm(
223
+ self,
224
+ caption: str,
225
+ lyrics: str,
226
+ temperature: float = 0.6,
227
+ cfg_scale: float = 1.0,
228
+ negative_prompt: str = "NO USER INPUT",
229
+ top_k: Optional[int] = None,
230
+ top_p: Optional[float] = None,
231
+ repetition_penalty: float = 1.0,
232
+ ) -> Tuple[Dict[str, Any], str, str]:
233
  """Generate metadata and audio codes using 5Hz LM with vllm backend"""
234
  try:
235
  from nanovllm import SamplingParams
 
246
  )
247
  logger.debug(f"[debug] formatted_prompt: {formatted_prompt}")
248
 
249
+ sampling_params = SamplingParams(
250
+ max_tokens=self.max_model_len-64,
251
+ temperature=temperature,
252
+ cfg_scale=cfg_scale,
253
+ top_k=top_k,
254
+ top_p=top_p,
255
+ repetition_penalty=repetition_penalty,
256
+ )
257
  # Use CFG if cfg_scale > 1.0
258
  if cfg_scale > 1.0:
259
  # Build unconditional prompt (user input replaced with "NO USER INPUT")
 
293
  error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
294
  return {}, "", error_msg
295
 
296
+ def generate_with_5hz_lm_pt(
297
+ self,
298
+ caption: str,
299
+ lyrics: str,
300
+ temperature: float = 0.6,
301
+ cfg_scale: float = 1.0,
302
+ negative_prompt: str = "NO USER INPUT",
303
+ top_k: Optional[int] = None,
304
+ top_p: Optional[float] = None,
305
+ repetition_penalty: float = 1.0,
306
+ ) -> Tuple[Dict[str, Any], str, str]:
307
  """Generate metadata and audio codes using 5Hz LM with PyTorch backend"""
308
  try:
309
  prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
 
332
  # Get max_new_tokens from model config or use a default
333
  max_new_tokens = getattr(self.llm.config, 'max_new_tokens', 4096)
334
  if hasattr(self, 'max_model_len'):
335
+ max_new_tokens = min(max_new_tokens, self.max_model_len - 64)
336
 
337
  # Define custom streamer for tqdm
338
  class TqdmTokenStreamer(BaseStreamer):
 
352
 
353
  streamer = TqdmTokenStreamer(total=max_new_tokens)
354
 
355
+ # Build logits processor list
356
+ logits_processor = LogitsProcessorList()
357
+
358
+ # Add repetition penalty if needed
359
+ if repetition_penalty != 1.0:
360
+ logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
361
+
362
+ # Add temperature warper if needed (temperature is handled separately in generate, but we can also use warper)
363
+ # Note: temperature is passed directly to generate(), but we can use TemperatureLogitsWarper for consistency
364
+ if temperature != 1.0:
365
+ logits_processor.append(TemperatureLogitsWarper(temperature=temperature))
366
+
367
+ # Add top-k warper if specified
368
+ if top_k is not None and top_k > 0:
369
+ logits_processor.append(TopKLogitsWarper(top_k=top_k))
370
+
371
+ # Add top-p warper if specified
372
+ if top_p is not None and top_p > 0.0 and top_p < 1.0:
373
+ logits_processor.append(TopPLogitsWarper(top_p=top_p))
374
+
375
+ # Handle CFG if cfg_scale > 1.0
376
+ if cfg_scale > 1.0:
377
+ # Build unconditional prompt
378
+ formatted_unconditional_prompt = self.llm_tokenizer.apply_chat_template(
379
+ [
380
+ {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
381
+ {"role": "user", "content": negative_prompt}
382
+ ],
383
+ tokenize=False,
384
+ add_generation_prompt=True,
385
+ )
386
+
387
+ # Tokenize unconditional prompt
388
+ uncond_inputs = self.llm_tokenizer(
389
+ formatted_unconditional_prompt,
390
+ return_tensors="pt",
391
+ padding=False,
392
+ truncation=True,
393
+ )
394
+ uncond_inputs = {k: v.to(self.device) for k, v in uncond_inputs.items()}
395
+
396
+ # Use custom CFG generation with batch processing
397
+ # Combine conditional and unconditional inputs into a batch
398
+ # Format: [cond_input, uncond_input]
399
+ batch_input_ids = torch.cat([inputs['input_ids'], uncond_inputs['input_ids']], dim=0)
400
+ batch_attention_mask = None
401
+ if 'attention_mask' in inputs:
402
+ batch_attention_mask = torch.cat([inputs['attention_mask'], uncond_inputs.get('attention_mask', torch.ones_like(uncond_inputs['input_ids']))], dim=0)
403
+
404
+ # Custom CFG generation loop
405
+ outputs = self._generate_with_cfg(
406
+ batch_input_ids=batch_input_ids,
407
+ batch_attention_mask=batch_attention_mask,
408
  max_new_tokens=max_new_tokens,
409
  temperature=temperature,
410
+ cfg_scale=cfg_scale,
411
+ logits_processor=logits_processor,
412
  pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
413
  streamer=streamer,
414
  )
415
+ else:
416
+ # Generate without CFG
417
+ with torch.no_grad():
418
+ outputs = self.llm.generate(
419
+ **inputs,
420
+ max_new_tokens=max_new_tokens,
421
+ temperature=temperature if temperature > 0 else 1.0,
422
+ do_sample=True if temperature > 0 else False,
423
+ logits_processor=logits_processor if len(logits_processor) > 0 else None,
424
+ pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
425
+ streamer=streamer,
426
+ )
427
 
428
  # Decode the generated tokens
429
  # Only decode the newly generated tokens (skip the input prompt)
 
438
  error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
439
  return {}, "", error_msg
440
 
441
+ def generate_with_5hz_lm(
442
+ self,
443
+ caption: str,
444
+ lyrics: str,
445
+ temperature: float = 0.6,
446
+ cfg_scale: float = 1.0,
447
+ negative_prompt: str = "NO USER INPUT",
448
+ top_k: Optional[int] = None,
449
+ top_p: Optional[float] = None,
450
+ repetition_penalty: float = 1.0,
451
+ ) -> Tuple[Dict[str, Any], str, str]:
452
  """Generate metadata and audio codes using 5Hz LM"""
453
  # Check if 5Hz LM is initialized
454
  if not hasattr(self, 'llm_initialized') or not self.llm_initialized:
 
465
  return {}, "", "❌ 5Hz LM backend not set. Please initialize it first."
466
 
467
  if self.llm_backend == "vllm":
468
+ return self.generate_with_5hz_lm_vllm(
469
+ caption, lyrics, temperature, cfg_scale, negative_prompt,
470
+ top_k, top_p, repetition_penalty
471
+ )
472
  else:
473
+ return self.generate_with_5hz_lm_pt(
474
+ caption, lyrics, temperature, cfg_scale, negative_prompt,
475
+ top_k, top_p, repetition_penalty
476
+ )
477
 
478
  def parse_lm_output(self, output_text: str) -> Tuple[Dict[str, Any], str]:
479
  """
 
556
 
557
  return metadata, audio_codes
558
 
559
+ def _generate_with_cfg(
560
+ self,
561
+ batch_input_ids: torch.Tensor,
562
+ batch_attention_mask: Optional[torch.Tensor],
563
+ max_new_tokens: int,
564
+ temperature: float,
565
+ cfg_scale: float,
566
+ logits_processor: Optional[LogitsProcessorList],
567
+ pad_token_id: int,
568
+ streamer: Optional[BaseStreamer],
569
+ ) -> torch.Tensor:
570
+ """
571
+ Custom generation loop with CFG support using batch processing.
572
+ Batch format: [conditional_input, unconditional_input]
573
+ This properly utilizes KV cache by processing both sequences in parallel.
574
+ """
575
+ model = self.llm
576
+ device = self.device
577
+ batch_size = batch_input_ids.shape[0] // 2 # Half are conditional, half are unconditional
578
+ cond_start_idx = 0
579
+ uncond_start_idx = batch_size
580
+
581
+ # Initialize generated sequences
582
+ generated_ids = batch_input_ids.clone()
583
+ if batch_attention_mask is not None:
584
+ attention_mask = batch_attention_mask.clone()
585
+ else:
586
+ attention_mask = torch.ones_like(batch_input_ids)
587
+
588
+ # Prepare model inputs
589
+ model_kwargs = {}
590
+ if batch_attention_mask is not None:
591
+ model_kwargs['attention_mask'] = attention_mask
592
+
593
+ # Past key values for KV cache (if model supports it)
594
+ past_key_values = None
595
+ use_cache = hasattr(model, 'generation_config') and getattr(model.generation_config, 'use_cache', True)
596
+
597
+ with torch.no_grad():
598
+ for step in range(max_new_tokens):
599
+ # Forward pass for the entire batch (conditional + unconditional)
600
+ if past_key_values is None:
601
+ # First step: full forward pass
602
+ outputs = model(
603
+ input_ids=generated_ids,
604
+ **model_kwargs,
605
+ use_cache=use_cache,
606
+ )
607
+ else:
608
+ # Subsequent steps: only forward the last token (utilizing KV cache)
609
+ outputs = model(
610
+ input_ids=generated_ids[:, -1:],
611
+ past_key_values=past_key_values,
612
+ **model_kwargs,
613
+ use_cache=use_cache,
614
+ )
615
+
616
+ # Get logits
617
+ next_token_logits = outputs.logits[:, -1, :] # [batch_size*2, vocab_size]
618
+
619
+ # Split conditional and unconditional logits
620
+ cond_logits = next_token_logits[cond_start_idx:cond_start_idx+batch_size]
621
+ uncond_logits = next_token_logits[uncond_start_idx:uncond_start_idx+batch_size]
622
+
623
+ # Apply CFG formula: logits_cfg = logits_uncond + cfg_scale * (logits_cond - logits_uncond)
624
+ cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
625
+
626
+ # Apply logits processors (temperature, top-k, top-p, repetition penalty)
627
+ if logits_processor is not None:
628
+ # Get current input_ids for repetition penalty (only conditional part)
629
+ current_input_ids = generated_ids[cond_start_idx:cond_start_idx+batch_size]
630
+ for processor in logits_processor:
631
+ cfg_logits = processor(current_input_ids, cfg_logits)
632
+
633
+ # Apply temperature and sample
634
+ if temperature > 0:
635
+ cfg_logits = cfg_logits / temperature
636
+ probs = torch.softmax(cfg_logits, dim=-1)
637
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
638
+ else:
639
+ next_tokens = torch.argmax(cfg_logits, dim=-1)
640
+
641
+ # Update generated sequences (apply same token to both conditional and unconditional)
642
+ next_tokens = next_tokens.unsqueeze(1)
643
+ generated_ids = torch.cat([generated_ids, next_tokens.repeat(2, 1)], dim=1)
644
+ attention_mask = torch.cat([attention_mask, torch.ones((batch_size*2, 1), device=device, dtype=attention_mask.dtype)], dim=1)
645
+ model_kwargs['attention_mask'] = attention_mask
646
+
647
+ # Update past_key_values for next iteration
648
+ if use_cache and hasattr(outputs, 'past_key_values'):
649
+ past_key_values = outputs.past_key_values
650
+
651
+ # Update streamer
652
+ if streamer is not None:
653
+ streamer.put(next_tokens[0]) # Only stream conditional tokens
654
+
655
+ # Check for EOS (simplified - you may want to check model's eos_token_id)
656
+ if (next_tokens[0] == pad_token_id).all():
657
+ break
658
+
659
+ if streamer is not None:
660
+ streamer.end()
661
+
662
+ # Return only conditional output
663
+ return generated_ids[cond_start_idx:cond_start_idx+batch_size]
664
+
665
  @contextmanager
666
  def _load_model_context(self):
667
  """
acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py CHANGED
@@ -212,22 +212,37 @@ class ModelRunner:
212
  """Prepare sampling parameters. For CFG batch, only return parameters for conditional sequences."""
213
  if is_cfg_batch:
214
  # For CFG batch, seqs contains [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
215
- # We only need temperatures for conditional sequences (first half)
216
  num_cond = len(seqs) // 2
217
  temperatures = []
218
  cfg_scales = []
 
 
 
219
  for seq in seqs[:num_cond]:
220
  temperatures.append(seq.temperature)
221
  cfg_scales.append(seq.cfg_scale)
 
 
 
222
  else:
223
  temperatures = []
224
  cfg_scales = []
 
 
 
225
  for seq in seqs:
226
  temperatures.append(seq.temperature)
227
  cfg_scales.append(seq.cfg_scale)
 
 
 
228
  temperatures = torch.tensor(temperatures, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
229
  cfg_scales = torch.tensor(cfg_scales, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
230
- return temperatures, cfg_scales
 
 
 
231
 
232
  @torch.inference_mode()
233
  def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill: bool):
@@ -274,7 +289,11 @@ class ModelRunner:
274
  # Prepare inputs for both conditional and unconditional (they're already in the batch)
275
  input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
276
  else self.prepare_decode(seqs))
277
- temperatures, cfg_scales = self.prepare_sample(seqs, is_cfg_batch=True) if self.rank == 0 else (None, None)
 
 
 
 
278
 
279
  # Run model forward (processes entire batch: cond + uncond)
280
  logits_all = self.run_model(input_ids, positions, is_prefill)
@@ -285,12 +304,44 @@ class ModelRunner:
285
  logits_cond = logits_all[:num_cond]
286
  logits_uncond = logits_all[num_cond:]
287
 
288
- # Apply CFG formula: logits_cfg = logits_cond + cfg_scale * (logits_cond - logits_uncond)
289
  cfg_scales_tensor = cfg_scales.unsqueeze(1) # [num_cond, 1]
290
- logits_cfg = logits_cond + cfg_scales_tensor * (logits_cond - logits_uncond)
 
 
 
291
 
292
  # Sample from CFG logits
293
- token_ids_cfg = self.sampler(logits_cfg, temperatures).tolist()
294
 
295
  # Return token_ids (will be applied to both conditional and unconditional sequences)
296
  return token_ids_cfg
@@ -300,11 +351,51 @@ class ModelRunner:
300
  # Normal batch (non-CFG)
301
  input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
302
  else self.prepare_decode(seqs))
303
- temperatures, cfg_scales = self.prepare_sample(seqs, is_cfg_batch=False) if self.rank == 0 else (None, None)
 
 
 
 
304
  logits = self.run_model(input_ids, positions, is_prefill)
305
  reset_context()
306
- token_ids = self.sampler(logits, temperatures).tolist() if self.rank == 0 else None
307
- return token_ids
308
 
309
  @torch.inference_mode()
310
  def capture_cudagraph(self):
 
212
  """Prepare sampling parameters. For CFG batch, only return parameters for conditional sequences."""
213
  if is_cfg_batch:
214
  # For CFG batch, seqs contains [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
215
+ # We only need parameters for conditional sequences (first half)
216
  num_cond = len(seqs) // 2
217
  temperatures = []
218
  cfg_scales = []
219
+ top_ks = []
220
+ top_ps = []
221
+ repetition_penalties = []
222
  for seq in seqs[:num_cond]:
223
  temperatures.append(seq.temperature)
224
  cfg_scales.append(seq.cfg_scale)
225
+ top_ks.append(seq.top_k if seq.top_k is not None else 0)
226
+ top_ps.append(seq.top_p if seq.top_p is not None else 1.0)
227
+ repetition_penalties.append(seq.repetition_penalty)
228
  else:
229
  temperatures = []
230
  cfg_scales = []
231
+ top_ks = []
232
+ top_ps = []
233
+ repetition_penalties = []
234
  for seq in seqs:
235
  temperatures.append(seq.temperature)
236
  cfg_scales.append(seq.cfg_scale)
237
+ top_ks.append(seq.top_k if seq.top_k is not None else 0)
238
+ top_ps.append(seq.top_p if seq.top_p is not None else 1.0)
239
+ repetition_penalties.append(seq.repetition_penalty)
240
  temperatures = torch.tensor(temperatures, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
241
  cfg_scales = torch.tensor(cfg_scales, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
242
+ top_ks = torch.tensor(top_ks, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
243
+ top_ps = torch.tensor(top_ps, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
244
+ repetition_penalties = torch.tensor(repetition_penalties, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
245
+ return temperatures, cfg_scales, top_ks, top_ps, repetition_penalties
246
 
247
  @torch.inference_mode()
248
  def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill: bool):
 
289
  # Prepare inputs for both conditional and unconditional (they're already in the batch)
290
  input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
291
  else self.prepare_decode(seqs))
292
+ sample_params = self.prepare_sample(seqs, is_cfg_batch=True) if self.rank == 0 else None
293
+ if sample_params is not None:
294
+ temperatures, cfg_scales, top_ks, top_ps, repetition_penalties = sample_params
295
+ else:
296
+ temperatures = cfg_scales = top_ks = top_ps = repetition_penalties = None
297
 
298
  # Run model forward (processes entire batch: cond + uncond)
299
  logits_all = self.run_model(input_ids, positions, is_prefill)
 
304
  logits_cond = logits_all[:num_cond]
305
  logits_uncond = logits_all[num_cond:]
306
 
307
+ # Apply repetition penalty to conditional logits (before CFG)
308
+ if repetition_penalties is not None:
309
+ for i, seq in enumerate(cond_seqs):
310
+ penalty = repetition_penalties[i].item()
311
+ if penalty != 1.0:
312
+ # Only penalize completion tokens (not prompt tokens)
313
+ completion_tokens = torch.tensor(seq.completion_token_ids, device=logits_cond.device)
314
+ if len(completion_tokens) > 0:
315
+ # Create token mask: mark tokens that appeared in completion
316
+ token_mask = torch.zeros(logits_cond.shape[1], dtype=torch.bool, device=logits_cond.device)
317
+ token_mask[completion_tokens] = True
318
+
319
+ # Apply standard repetition penalty formula (matching transformers implementation):
320
+ # For tokens in completion: if score < 0 then score * penalty, else score / penalty
321
+ penalty_scores = torch.where(
322
+ logits_cond[i] < 0,
323
+ logits_cond[i] * penalty,
324
+ logits_cond[i] / penalty
325
+ )
326
+ # Only apply penalty to tokens that appeared in completion
327
+ logits_cond[i] = torch.where(token_mask, penalty_scores, logits_cond[i])
328
+
329
+ # Apply CFG formula: logits_cfg = logits_uncond + cfg_scale * (logits_cond - logits_uncond)
330
  cfg_scales_tensor = cfg_scales.unsqueeze(1) # [num_cond, 1]
331
+ logits_cfg = logits_uncond + cfg_scales_tensor * (logits_cond - logits_uncond)
332
+
333
+ # Prepare input_ids for sampler (for repetition penalty, though we already applied it)
334
+ cond_input_ids = torch.tensor([seq.token_ids for seq in cond_seqs], device=logits_cfg.device)
335
 
336
  # Sample from CFG logits
337
+ token_ids_cfg = self.sampler(
338
+ logits_cfg,
339
+ temperatures,
340
+ top_ks=top_ks if top_ks is not None else None,
341
+ top_ps=top_ps if top_ps is not None else None,
342
+ repetition_penalties=None, # Already applied above
343
+ input_ids=cond_input_ids,
344
+ ).tolist()
345
 
346
  # Return token_ids (will be applied to both conditional and unconditional sequences)
347
  return token_ids_cfg
 
351
  # Normal batch (non-CFG)
352
  input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
353
  else self.prepare_decode(seqs))
354
+ sample_params = self.prepare_sample(seqs, is_cfg_batch=False) if self.rank == 0 else None
355
+ if sample_params is not None:
356
+ temperatures, cfg_scales, top_ks, top_ps, repetition_penalties = sample_params
357
+ else:
358
+ temperatures = cfg_scales = top_ks = top_ps = repetition_penalties = None
359
  logits = self.run_model(input_ids, positions, is_prefill)
360
  reset_context()
361
+
362
+ if self.rank == 0:
363
+ # Apply repetition penalty to logits
364
+ if repetition_penalties is not None:
365
+ for i, seq in enumerate(seqs):
366
+ penalty = repetition_penalties[i].item()
367
+ if penalty != 1.0:
368
+ # Only penalize completion tokens (not prompt tokens)
369
+ completion_tokens = torch.tensor(seq.completion_token_ids, device=logits.device)
370
+ if len(completion_tokens) > 0:
371
+ # Create token mask: mark tokens that appeared in completion
372
+ token_mask = torch.zeros(logits.shape[1], dtype=torch.bool, device=logits.device)
373
+ token_mask[completion_tokens] = True
374
+
375
+ # Apply standard repetition penalty formula (matching transformers implementation):
376
+ # For tokens in completion: if score < 0 then score * penalty, else score / penalty
377
+ penalty_scores = torch.where(
378
+ logits[i] < 0,
379
+ logits[i] * penalty,
380
+ logits[i] / penalty
381
+ )
382
+ # Only apply penalty to tokens that appeared in completion
383
+ logits[i] = torch.where(token_mask, penalty_scores, logits[i])
384
+
385
+ # Prepare input_ids for sampler
386
+ seq_input_ids = torch.tensor([seq.token_ids for seq in seqs], device=logits.device)
387
+
388
+ token_ids = self.sampler(
389
+ logits,
390
+ temperatures,
391
+ top_ks=top_ks if top_ks is not None else None,
392
+ top_ps=top_ps if top_ps is not None else None,
393
+ repetition_penalties=None, # Already applied above
394
+ input_ids=seq_input_ids,
395
+ ).tolist()
396
+ return token_ids
397
+ else:
398
+ return None
399
 
400
  @torch.inference_mode()
401
  def capture_cudagraph(self):
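
Editor's note: the model_runner hunks above change two things in the decode path. A transformers-style repetition penalty is now applied to the conditional logits before guidance, and the guidance mix uses the standard form logits_uncond + cfg_scale * (logits_cond - logits_uncond). The standalone sketch below restates that logic outside the ModelRunner for reference; the helper name cfg_mix_and_penalize and the flat [num_cond, vocab_size] shapes are illustrative assumptions, not part of the committed code.

import torch

def cfg_mix_and_penalize(
    logits_cond: torch.Tensor,           # [num_cond, vocab_size]
    logits_uncond: torch.Tensor,         # [num_cond, vocab_size]
    cfg_scales: torch.Tensor,            # [num_cond]
    completion_token_ids: list,          # per-sequence lists of already generated token ids
    repetition_penalties: torch.Tensor,  # [num_cond]
) -> torch.Tensor:
    # Repetition penalty (transformers convention), restricted to tokens the
    # sequence has already generated: negative scores are multiplied by the
    # penalty, positive scores are divided by it.
    for i, tokens in enumerate(completion_token_ids):
        penalty = float(repetition_penalties[i])
        if penalty != 1.0 and len(tokens) > 0:
            idx = torch.tensor(tokens, device=logits_cond.device)
            mask = torch.zeros(logits_cond.shape[1], dtype=torch.bool, device=logits_cond.device)
            mask[idx] = True
            penalized = torch.where(logits_cond[i] < 0, logits_cond[i] * penalty, logits_cond[i] / penalty)
            logits_cond[i] = torch.where(mask, penalized, logits_cond[i])

    # Classifier-free guidance: move from the unconditional logits toward the
    # conditional ones; cfg_scale == 1.0 reproduces logits_cond exactly.
    return logits_uncond + cfg_scales.unsqueeze(1) * (logits_cond - logits_uncond)

With cfg_scale == 1.0 the mix is exactly the conditional logits, which is consistent with the SamplingParams assertion that cfg_scale must be >= 1.0.
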
acestep/third_parts/nano-vllm/nanovllm/engine/sequence.py CHANGED
@@ -28,6 +28,9 @@ class Sequence:
28
  self.max_tokens = sampling_params.max_tokens
29
  self.ignore_eos = sampling_params.ignore_eos
30
  self.cfg_scale = sampling_params.cfg_scale
 
 
 
31
  # For CFG: mark if this is an unconditional sequence
32
  self.is_unconditional = is_unconditional
33
  # For CFG: reference to the corresponding conditional sequence (if this is unconditional)
 
28
  self.max_tokens = sampling_params.max_tokens
29
  self.ignore_eos = sampling_params.ignore_eos
30
  self.cfg_scale = sampling_params.cfg_scale
31
+ self.top_k = sampling_params.top_k
32
+ self.top_p = sampling_params.top_p
33
+ self.repetition_penalty = sampling_params.repetition_penalty
34
  # For CFG: mark if this is an unconditional sequence
35
  self.is_unconditional = is_unconditional
36
  # For CFG: reference to the corresponding conditional sequence (if this is unconditional)
acestep/third_parts/nano-vllm/nanovllm/layers/sampler.py CHANGED
@@ -1,5 +1,6 @@
1
  import torch
2
  from torch import nn
 
3
 
4
 
5
  class Sampler(nn.Module):
@@ -8,8 +9,66 @@ class Sampler(nn.Module):
8
  super().__init__()
9
 
10
  @torch.compile
11
- def forward(self, logits: torch.Tensor, temperatures: torch.Tensor):
12
  logits = logits.float().div_(temperatures.unsqueeze(dim=1))
13
  probs = torch.softmax(logits, dim=-1)
14
  sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)).argmax(dim=-1)
15
  return sample_tokens
 
1
  import torch
2
  from torch import nn
3
+ from typing import Optional
4
 
5
 
6
  class Sampler(nn.Module):
 
9
  super().__init__()
10
 
11
  @torch.compile
12
+ def forward(
13
+ self,
14
+ logits: torch.Tensor,
15
+ temperatures: torch.Tensor,
16
+ top_ks: Optional[torch.Tensor] = None,
17
+ top_ps: Optional[torch.Tensor] = None,
18
+ repetition_penalties: Optional[torch.Tensor] = None,
19
+ input_ids: Optional[torch.Tensor] = None,
20
+ ):
21
+ """
22
+ Sample tokens from logits with optional top-k, top-p, and repetition penalty.
23
+
24
+ Args:
25
+ logits: [batch_size, vocab_size] logits tensor
26
+ temperatures: [batch_size] temperature values
27
+ top_ks: Optional [batch_size] top-k values (None or 0 means no top-k filtering)
28
+ top_ps: Optional [batch_size] top-p values (None or 1.0 means no top-p filtering)
29
+ repetition_penalties: Optional [batch_size] repetition penalty values (1.0 means no penalty)
30
+ input_ids: Optional [batch_size, seq_len] input token ids for repetition penalty
31
+ """
32
+ batch_size, vocab_size = logits.shape
33
+
34
+ # Note: Repetition penalty is applied in ModelRunner before calling sampler
35
+ # This allows us to use the full sequence context
36
+
37
+ # Apply temperature
38
  logits = logits.float().div_(temperatures.unsqueeze(dim=1))
39
+
40
+ # Apply top-k filtering if specified
41
+ if top_ks is not None:
42
+ for i in range(batch_size):
43
+ top_k = top_ks[i].item()
44
+ if top_k > 0 and top_k < vocab_size:
45
+ # Get top-k logits, set others to -inf
46
+ top_k_logits, top_k_indices = torch.topk(logits[i], int(top_k), dim=-1)
47
+ filtered_logits = torch.full_like(logits[i], float('-inf'))
48
+ filtered_logits[top_k_indices] = top_k_logits
49
+ logits[i] = filtered_logits
50
+
51
+ # Apply top-p (nucleus) filtering if specified
52
+ if top_ps is not None:
53
+ probs = torch.softmax(logits, dim=-1)
54
+ for i in range(batch_size):
55
+ top_p = top_ps[i].item()
56
+ if 0.0 < top_p < 1.0:
57
+ # Sort probabilities in descending order
58
+ sorted_probs, sorted_indices = torch.sort(probs[i], descending=True)
59
+ # Calculate cumulative probabilities
60
+ cumsum_probs = torch.cumsum(sorted_probs, dim=-1)
61
+ # Find the cutoff point
62
+ cutoff_idx = (cumsum_probs <= top_p).sum().item()
63
+ if cutoff_idx < len(sorted_indices):
64
+ cutoff_idx += 1 # Include one more token to ensure we have at least one
65
+ # Create mask for tokens to keep
66
+ mask = torch.zeros_like(probs[i])
67
+ mask[sorted_indices[:cutoff_idx]] = 1.0
68
+ # Apply mask: set filtered tokens to -inf
69
+ logits[i] = torch.where(mask > 0, logits[i], torch.tensor(float('-inf'), device=logits.device))
70
+
71
+ # Sample using Gumbel-max trick (equivalent to sampling from softmax)
72
  probs = torch.softmax(logits, dim=-1)
73
  sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)).argmax(dim=-1)
74
  return sample_tokens
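
Editor's note: the rewritten Sampler.forward above chains temperature scaling, optional per-row top-k and top-p filtering, and a Gumbel-max draw. The single-sequence sketch below shows the same pipeline in isolation; filter_and_sample is a hypothetical helper operating on a 1-D logits tensor, not an API of nano-vllm.

import torch

def filter_and_sample(logits: torch.Tensor, temperature: float,
                      top_k: int = 0, top_p: float = 1.0) -> int:
    logits = logits.float() / temperature

    # Top-k: keep only the k largest logits, push everything else to -inf.
    if 0 < top_k < logits.numel():
        kth_value = torch.topk(logits, top_k).values[-1]
        logits = torch.where(logits < kth_value, torch.full_like(logits, float("-inf")), logits)

    # Top-p (nucleus): keep the smallest prefix of the sorted distribution whose
    # cumulative probability reaches top_p, plus the boundary token.
    if 0.0 < top_p < 1.0:
        probs = torch.softmax(logits, dim=-1)
        sorted_probs, sorted_idx = torch.sort(probs, descending=True)
        keep = int((torch.cumsum(sorted_probs, dim=-1) <= top_p).sum()) + 1
        mask = torch.zeros_like(probs, dtype=torch.bool)
        mask[sorted_idx[:keep]] = True
        logits = torch.where(mask, logits, torch.full_like(logits, float("-inf")))

    # Gumbel-max trick: dividing probabilities by i.i.d. Exponential(1) noise and
    # taking the argmax samples from the softmax distribution.
    probs = torch.softmax(logits, dim=-1)
    noise = torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)
    return int((probs / noise).argmax())

Note that the per-row Python loops and .item() calls in the committed forward are likely to cause graph breaks under @torch.compile; a fully batched formulation would avoid that at the cost of more tensor bookkeeping.
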
acestep/third_parts/nano-vllm/nanovllm/sampling_params.py CHANGED
@@ -1,4 +1,5 @@
1
  from dataclasses import dataclass
 
2
 
3
 
4
  @dataclass
@@ -7,7 +8,15 @@ class SamplingParams:
7
  max_tokens: int = 64
8
  ignore_eos: bool = False
9
  cfg_scale: float = 1.0 # CFG guidance scale. When > 1.0, applies classifier-free guidance
 
 
 
10
 
11
  def __post_init__(self):
12
  assert self.temperature > 1e-10, "greedy sampling is not permitted"
13
  assert self.cfg_scale >= 1.0, "cfg_scale must be >= 1.0"
1
  from dataclasses import dataclass
2
+ from typing import Optional
3
 
4
 
5
  @dataclass
 
8
  max_tokens: int = 64
9
  ignore_eos: bool = False
10
  cfg_scale: float = 1.0 # CFG guidance scale. When > 1.0, applies classifier-free guidance
11
+ top_k: Optional[int] = None # Top-k sampling: consider only top k tokens
12
+ top_p: Optional[float] = None # Top-p (nucleus) sampling: consider tokens with cumulative probability <= top_p
13
+ repetition_penalty: float = 1.0 # Repetition penalty: >1.0 reduces repetition, <1.0 increases it
14
 
15
  def __post_init__(self):
16
  assert self.temperature > 1e-10, "greedy sampling is not permitted"
17
  assert self.cfg_scale >= 1.0, "cfg_scale must be >= 1.0"
18
+ if self.top_k is not None:
19
+ assert self.top_k > 0, "top_k must be > 0"
20
+ if self.top_p is not None:
21
+ assert 0.0 < self.top_p <= 1.0, "top_p must be in (0.0, 1.0]"
22
+ assert self.repetition_penalty > 0.0, "repetition_penalty must be > 0.0"
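
Editor's note: taken together, the new fields make sampling behaviour configurable per request. A minimal usage sketch follows, assuming the package-relative import path shown (the dataclass itself is the one defined in the file above); the defaults reproduce the previous behaviour.

from nanovllm.sampling_params import SamplingParams  # import path assumed from the repo layout

# Defaults (top_k=None, top_p=None, repetition_penalty=1.0) keep the old behaviour.
plain = SamplingParams(temperature=1.0, max_tokens=64, cfg_scale=1.0)

# Per-request sampling controls added by this commit.
guided = SamplingParams(
    temperature=0.85,
    max_tokens=256,
    cfg_scale=2.0,            # classifier-free guidance strength (must be >= 1.0)
    top_k=50,                 # keep only the 50 most likely tokens
    top_p=0.9,                # nucleus sampling over 90% cumulative probability
    repetition_penalty=1.1,   # > 1.0 discourages repeating generated tokens
)
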