fix duration
acestep/gradio_ui/interfaces/generation.py
CHANGED
@@ -364,7 +364,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                     minimum=-1,
                     maximum=600.0,
                     step=1,
-                    info="Use -1 for
+                    info="Use -1 for auto, or 10-600 seconds",
                     scale=1,
                 )
                 vocal_language = gr.Dropdown(
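The UI change above only rewrites the slider's help text so it matches the new behaviour: -1 means automatic length, anything else is an explicit 10-600 second target. For reference, a minimal standalone sketch of such a duration slider is shown below; the variable name, label, and default value are illustrative assumptions, and only the keyword arguments visible in the diff (minimum, maximum, step, info, scale) come from the source.

import gradio as gr

# Hypothetical sketch of a duration control matching the updated help text.
# Only minimum/maximum/step/info/scale are taken from the diff above; the
# variable name, label, and default value are assumptions for illustration.
audio_duration = gr.Slider(
    minimum=-1,
    maximum=600.0,
    step=1,
    value=-1,
    label="Audio Duration (seconds)",
    info="Use -1 for auto, or 10-600 seconds",
    scale=1,
)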
acestep/inference.py
CHANGED
@@ -447,6 +447,7 @@ def generate_music(
         negative_prompt=params.lm_negative_prompt,
         top_k=top_k_value,
         top_p=top_p_value,
+        target_duration=audio_duration,  # Pass duration to limit audio codes generation
         user_metadata=user_metadata_to_pass,
         use_cot_caption=params.use_cot_caption,
         use_cot_language=params.use_cot_language,
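This one-line change forwards the user-facing duration into the LLM stage so the number of generated audio codes can be bounded up front rather than relying on the model's EOS token alone. Below is a hedged sketch of the kind of normalization that keeps the slider's -1 sentinel and the 10-600 s range consistent with that call; the helper name is hypothetical and not part of the repository.

def normalize_target_duration(ui_value):
    """Hypothetical helper: map the UI slider value to a target_duration argument.

    -1 (the slider's "auto" sentinel) and None both mean "no explicit limit";
    any positive value is clamped to the 10-600 s range advertised in the UI,
    matching the clamp applied later in LLMHandler.
    """
    if ui_value is None or ui_value <= 0:
        return None
    return float(max(10, min(600, ui_value)))

# e.g. normalize_target_duration(-1) -> None
#      normalize_target_duration(5)  -> 10.0
#      normalize_target_duration(240) -> 240.0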
acestep/llm_inference.py
CHANGED
@@ -474,8 +474,20 @@ class LLMHandler:
             codes_temperature=codes_temperature,
         )
 
+        # Calculate max_tokens based on target_duration if specified
+        # 5 audio codes = 1 second, plus ~500 tokens for CoT metadata and safety margin
+        if target_duration is not None and target_duration > 0:
+            # Ensure duration is within valid range (10-600 seconds)
+            effective_duration = max(10, min(600, target_duration))
+            max_tokens = int(effective_duration * 5) + 500
+            # Cap at model's max length
+            max_tokens = min(max_tokens, self.max_model_len - 64)
+        else:
+            # No duration constraint - use default (model will stop at EOS naturally)
+            max_tokens = self.max_model_len - 64
+
         sampling_params = SamplingParams(
-            max_tokens=
+            max_tokens=max_tokens,
             temperature=effective_sampler_temp,
             cfg_scale=cfg_scale,
             top_k=top_k,
@@ -566,7 +578,17 @@ class LLMHandler:
 
         with self._load_model_context():
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
+
+            # Calculate max_new_tokens based on target_duration if specified
+            # 5 audio codes = 1 second, plus ~500 tokens for CoT metadata and safety margin
+            if target_duration is not None and target_duration > 0:
+                # Ensure duration is within valid range (10-600 seconds)
+                effective_duration = max(10, min(600, target_duration))
+                max_new_tokens = int(effective_duration * 5) + 500
+            else:
+                max_new_tokens = getattr(self.llm.config, "max_new_tokens", 4096)
+
+            # Cap at model's max length
             if hasattr(self, "max_model_len"):
                 max_new_tokens = min(max_new_tokens, self.max_model_len - 64)
 
@@ -1927,6 +1949,18 @@ class LLMHandler:
             return output_text, f"✅ Generated successfully (pt) | length={len(output_text)}"
 
         except Exception as e:
+            # Reset nano-vllm state on error to prevent stale context from causing
+            # subsequent CUDA illegal memory access errors
+            if self.llm_backend == "vllm":
+                try:
+                    from nanovllm.utils.context import reset_context
+                    reset_context()
+                except ImportError:
+                    pass
+            # Clear CUDA cache to release any corrupted memory
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
             return "", f"❌ Error generating from formatted prompt: {e}"
 
     def _generate_with_constrained_decoding(
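The first two hunks apply the same duration-to-token heuristic: 5 audio codes encode one second of audio, roughly 500 extra tokens are reserved for chain-of-thought metadata and safety margin, and the result is capped 64 tokens below the model's context length. A self-contained sketch of that rule follows; the function name and the 4096 fallback are chosen only to mirror what the diff shows. The third hunk is separate cleanup that resets nano-vllm's context and empties the CUDA cache when generation fails, so a broken request does not poison the next one.

def duration_to_token_budget(target_duration, max_model_len, default_budget=4096):
    """Sketch of the budget rule used in this commit (names are illustrative).

    5 audio codes = 1 second of audio; ~500 tokens are added for CoT metadata
    and safety margin; the total is capped at max_model_len - 64 so the prompt
    still fits inside the context window.
    """
    if target_duration is not None and target_duration > 0:
        effective_duration = max(10, min(600, target_duration))
        budget = int(effective_duration * 5) + 500
    else:
        budget = default_budget
    return min(budget, max_model_len - 64)

# Worked examples:
#   duration_to_token_budget(120, max_model_len=8192)  -> 1100  (120 * 5 + 500)
#   duration_to_token_budget(600, max_model_len=8192)  -> 3500  (the 600 s ceiling)
#   duration_to_token_budget(None, max_model_len=4096) -> 4032  (default, capped)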
acestep/third_parts/nano-vllm/nanovllm/engine/block_manager.py
CHANGED
@@ -86,6 +86,13 @@ class BlockManager:
             block = self.blocks[block_id]
             block.ref_count -= 1
             if block.ref_count == 0:
+                # Fix: Clean up hash_to_block_id mapping to prevent stale references
+                # This prevents CUDA illegal memory access when prefix cache tries to
+                # reuse a block_id that has already been freed
+                if block.hash != -1:
+                    cached_id = self.hash_to_block_id.get(block.hash)
+                    if cached_id == block_id:
+                        del self.hash_to_block_id[block.hash]
                 self._deallocate_block(block_id)
         seq.num_cached_tokens = 0
         seq.block_table.clear()
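The invariant this hunk restores is that hash_to_block_id never points at a block whose ref_count has dropped to zero and which has been returned to the free pool; otherwise a later prefix-cache lookup can hand out a block id that is already serving a different sequence, which eventually surfaces as a CUDA illegal memory access. The toy cache below (not the nano-vllm classes, just an illustration of the bookkeeping) shows where the deletion has to happen.

class TinyBlockCache:
    """Toy illustration of prefix-cache bookkeeping; not the nano-vllm BlockManager."""

    def __init__(self, num_blocks):
        self.free_ids = list(range(num_blocks))
        self.ref_count = {i: 0 for i in range(num_blocks)}
        self.hash_to_block_id = {}

    def allocate(self, block_hash):
        cached = self.hash_to_block_id.get(block_hash)
        if cached is not None:
            # Safe only because release() removes stale entries (the fix below);
            # without it, `cached` could be a block id that was freed and reused.
            self.ref_count[cached] += 1
            return cached
        block_id = self.free_ids.pop()
        self.ref_count[block_id] = 1
        self.hash_to_block_id[block_hash] = block_id
        return block_id

    def release(self, block_hash, block_id):
        self.ref_count[block_id] -= 1
        if self.ref_count[block_id] == 0:
            # The fix: drop the hash mapping before recycling the block id.
            if self.hash_to_block_id.get(block_hash) == block_id:
                del self.hash_to_block_id[block_hash]
            self.free_ids.append(block_id)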
acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py
CHANGED
@@ -325,6 +325,17 @@ class ModelRunner:
             # Fall back to eager mode when block_tables is too large for CUDA graph
             return self.model.compute_logits(self.model(input_ids, positions))
 
+        # Fix: Also check if block_tables row count matches batch size
+        # Dimension mismatch can cause CUDA illegal memory access during graph replay
+        if context.block_tables.size(0) != bs:
+            # Fall back to eager mode when block_tables row count doesn't match batch size
+            return self.model.compute_logits(self.model(input_ids, positions))
+
+        # Fix: Verify slot_mapping and context_lens dimensions match batch size
+        if context.slot_mapping.size(0) != bs or context.context_lens.size(0) != bs:
+            # Fall back to eager mode when dimensions don't match
+            return self.model.compute_logits(self.model(input_ids, positions))
+
         graph = self.graphs[next(x for x in self.graph_bs if x >= bs)]
         graph_vars = self.graph_vars
         graph_vars["input_ids"][:bs] = input_ids
@@ -416,9 +427,10 @@ class ModelRunner:
             ).tolist()
 
             # Update logits processor state after sampling
-            for
-
-
+            # NOTE: Only update for the first sequence since all sequences share the same processor
+            # Updating multiple times would cause duplicate state updates (e.g., codes_count += N instead of += 1)
+            if cond_seqs and cond_seqs[0].logits_processor_update_state is not None:
+                cond_seqs[0].logits_processor_update_state(token_ids_cfg[0])
 
             # Return token_ids (will be applied to both conditional and unconditional sequences)
             return token_ids_cfg
@@ -483,9 +495,11 @@ class ModelRunner:
             ).tolist()
 
             # Update logits processor state after sampling
-            for
-
-
+            # NOTE: Only update for the first sequence since all sequences may share the same processor
+            # (when using a single SamplingParams for batch generation)
+            # Updating multiple times would cause duplicate state updates (e.g., codes_count += N instead of += 1)
+            if seqs and seqs[0].logits_processor_update_state is not None:
+                seqs[0].logits_processor_update_state(token_ids[0])
 
             return token_ids
         else: