Commit 1bdda7b by xushengyuan · 1 Parent(s): 447806b

add 5hz llm test support & fix 5hz llm transformers inference

Files changed (2)
  1. acestep/handler.py +47 -19
  2. test.py +52 -6
acestep/handler.py CHANGED
@@ -20,7 +20,8 @@ from tqdm import tqdm
 from loguru import logger
 import warnings
 
-from transformers import AutoTokenizer, AutoModel
+from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
+from transformers.generation.streamers import BaseStreamer
 from diffusers.models import AutoencoderOobleck
 
 
@@ -175,6 +176,8 @@ class AceStepHandler:
         try:
             if device == "auto":
                 device = "cuda" if torch.cuda.is_available() else "cpu"
+
+            status_msg = ""
 
             self.device = device
             self.offload_to_cpu = offload_to_cpu
@@ -203,7 +206,6 @@ class AceStepHandler:
                 self.model = AutoModel.from_pretrained(
                     acestep_v15_checkpoint_path,
                     trust_remote_code=True,
-                    dtype=self.dtype,
                     attn_implementation=attn_implementation
                 )
             except Exception as e:
@@ -214,7 +216,6 @@ class AceStepHandler:
                 self.model = AutoModel.from_pretrained(
                     acestep_v15_checkpoint_path,
                     trust_remote_code=True,
-                    dtype=self.dtype,
                     attn_implementation=attn_implementation
                 )
             else:
@@ -299,8 +300,11 @@ class AceStepHandler:
                 # vllm initialization failed, fallback to PyTorch
                 if not self.llm_initialized:
                     try:
-                        self.llm = AutoModel.from_pretrained(full_lm_model_path)
-                        self.llm = self.llm.to(device).to(self.dtype)
+                        self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
+                        if not self.offload_to_cpu:
+                            self.llm = self.llm.to(device).to(self.dtype)
+                        else:
+                            self.llm = self.llm.to("cpu").to(self.dtype)
                         self.llm.eval()
                         self.llm_backend = "pt"
                         self.llm_initialized = True
@@ -311,9 +315,12 @@ class AceStepHandler:
             else:
                 # For CPU or other devices, use PyTorch backend
                 try:
-                    self.llm = AutoModel.from_pretrained(full_lm_model_path)
-                    self.llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
-                    self.llm = self.llm.to(device).to(self.dtype)
+                    self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
+                    self.llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True, trust_remote_code=True)
+                    if not self.offload_to_cpu:
+                        self.llm = self.llm.to(device).to(self.dtype)
+                    else:
+                        self.llm = self.llm.to("cpu").to(self.dtype)
                     self.llm.eval()
                     self.llm_backend = "pt"
                     self.llm_initialized = True
@@ -328,7 +335,7 @@ class AceStepHandler:
             # Determine actual attention implementation used
             actual_attn = getattr(self.config, "_attn_implementation", "eager")
 
-            status_msg = f"✅ Model initialized successfully on {device}\n"
+            status_msg = f"✅ Model initialized successfully on {device}\n" + status_msg
             status_msg += f"Main model: {acestep_v15_checkpoint_path}\n"
             status_msg += f"VAE: {vae_checkpoint_path}\n"
             status_msg += f"Text encoder: {text_encoder_path}\n"
@@ -581,22 +588,43 @@ class AceStepHandler:
            padding=False,
            truncation=True,
        )
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
        # Generate with the model
-        with torch.no_grad():
+        with self._load_model_context("llm"):
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
            # Get max_new_tokens from model config or use a default
            max_new_tokens = getattr(self.llm.config, 'max_new_tokens', 4096)
            if hasattr(self, 'max_model_len'):
                max_new_tokens = min(max_new_tokens, self.max_model_len)
 
-            outputs = self.llm.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                do_sample=True if temperature > 0 else False,
-                pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
-            )
+            # Define custom streamer for tqdm
+            class TqdmTokenStreamer(BaseStreamer):
+                def __init__(self, total):
+                    self.pbar = tqdm(total=total, desc="Generating 5Hz tokens", unit="token", maxinterval=1)
+
+                def put(self, value):
+                    # value is tensor of token ids
+                    if value.dim() > 1:
+                        num_tokens = value.numel()
+                    else:
+                        num_tokens = len(value)
+                    self.pbar.update(num_tokens)
+
+                def end(self):
+                    self.pbar.close()
+
+            streamer = TqdmTokenStreamer(total=max_new_tokens)
+
+            with torch.no_grad():
+                outputs = self.llm.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    do_sample=True if temperature > 0 else False,
+                    pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
+                    streamer=streamer,
+                )
 
        # Decode the generated tokens
        # Only decode the newly generated tokens (skip the input prompt)
@@ -776,7 +804,7 @@ class AceStepHandler:
        # Expand to include quantizer dimension: [1, T_5Hz, num_quantizers]
        if indices.dim() == 2:
            indices = indices.unsqueeze(-1).expand(-1, -1, num_quantizers)
-
+        print(indices.shape)
        # Get quantized representation from indices: [1, T_5Hz, dim]
        quantized = quantizer.get_output_from_indices(indices)
        if quantized.dtype != self.dtype:
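
Note on the progress bar added above: transformers' generate() accepts a streamer object and calls its put() method first with the prompt ids and then once per newly sampled token, and end() when generation finishes; the TqdmTokenStreamer in this commit simply counts those calls. A minimal, self-contained sketch of the same pattern ("gpt2" is only a placeholder checkpoint, not the 5Hz LM path used by the handler):

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.streamers import BaseStreamer

class TqdmTokenStreamer(BaseStreamer):
    """Count token ids as generate() emits them and drive a tqdm bar."""
    def __init__(self, total):
        self.pbar = tqdm(total=total, desc="Generating tokens", unit="token")

    def put(self, value):
        # Called first with the full prompt ids, then once per new token.
        self.pbar.update(value.numel() if value.dim() > 1 else len(value))

    def end(self):
        self.pbar.close()

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
inputs = tokenizer("Hello", return_tensors="pt")
with torch.no_grad():
    model.generate(**inputs, max_new_tokens=32,
                   pad_token_id=tokenizer.eos_token_id,
                   streamer=TqdmTokenStreamer(total=32))
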
test.py CHANGED
@@ -35,13 +35,15 @@ def main():
     device = "xpu"
     print(f"Using device: {device}")
 
+    use_llm = False
+
     status, enabled = handler.initialize_service(
         project_root=project_root,
         config_path=model_name,
         device=device,
-        init_llm=True,
+        init_llm=use_llm,
         use_flash_attention=False, # Default in UI
-        compile_model=False,
+        compile_model=True,
         offload_to_cpu=True,
         offload_dit_to_cpu=False, # Keep DiT on GPU
     )
@@ -95,6 +97,49 @@ def main():
 
     print("Starting generation...")
 
+    # Generate hints using 5Hz LLM
+    if use_llm:
+        print("Generating hints using 5Hz LLM...")
+        lm_temperature = 0.6
+        metadata, audio_codes, lm_status = handler.generate_with_5hz_lm(captions, lyrics, lm_temperature)
+        print(f"5Hz LLM Status: {lm_status}")
+        print(f"Generated Metadata: {metadata}")
+        print(f"Generated Audio Codes (first 50 chars): {audio_codes[:50]}...")
+    else:
+        print("Skipping 5Hz LLM generation...")
+        metadata = {}
+        audio_codes = None
+        lm_status = "Skipped"
+
+    # Use generated metadata if available
+    bpm = metadata.get('bpm', 90)
+    if bpm == "N/A" or bpm == "":
+        bpm = 90
+    else:
+        try:
+            bpm = int(float(bpm))
+        except:
+            bpm = 90
+
+    key_scale = metadata.get('keyscale', metadata.get('key_scale', "A major"))
+    if key_scale == "N/A":
+        key_scale = "A major"
+
+    time_signature = metadata.get('timesignature', metadata.get('time_signature', "4"))
+    if time_signature == "N/A":
+        time_signature = "4"
+
+    audio_duration = metadata.get('duration', 120)
+    if audio_duration == "N/A":
+        audio_duration = 120
+    else:
+        try:
+            audio_duration = float(audio_duration)
+        except:
+            audio_duration = 120
+
+    print(f"Using parameters: BPM={bpm}, Key={key_scale}, Time Sig={time_signature}, Duration={audio_duration}")
+
     # Reset peak memory stats
     if hasattr(torch, 'xpu') and torch.xpu.is_available():
         torch.xpu.reset_peak_memory_stats()
@@ -105,21 +150,22 @@ def main():
     results = handler.generate_music(
         captions=captions,
         lyrics=lyrics,
-        bpm=90,
-        key_scale="A major",
-        time_signature="4",
+        bpm=bpm,
+        key_scale=key_scale,
+        time_signature=time_signature,
         vocal_language="zh",
         inference_steps=8,
         guidance_scale=7.0,
         use_random_seed=False,
         seed=seeds,
-        audio_duration=120,
+        audio_duration=audio_duration,
         batch_size=1,
         task_type="text2music",
         cfg_interval_start=0.0,
         cfg_interval_end=0.95,
         audio_format="wav",
         use_tiled_decode=True,
+        audio_code_string=audio_codes,
     )
 
     # Unpack results
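
The new test path coerces the LM-returned metadata inline; the same handling can be read as one small normalization step. A sketch of that step as a standalone helper (the name normalize_lm_metadata is hypothetical; keys and fallback defaults are taken from the diff above):

def normalize_lm_metadata(metadata: dict) -> dict:
    # LM output fields may be missing, "N/A", or strings, so coerce with safe defaults.
    def to_number(value, cast, default):
        try:
            return cast(value)
        except (TypeError, ValueError):
            return default

    bpm = metadata.get("bpm", 90)
    bpm = 90 if bpm in ("N/A", "") else to_number(bpm, lambda v: int(float(v)), 90)

    key_scale = metadata.get("keyscale", metadata.get("key_scale", "A major"))
    key_scale = "A major" if key_scale == "N/A" else key_scale

    time_signature = metadata.get("timesignature", metadata.get("time_signature", "4"))
    time_signature = "4" if time_signature == "N/A" else time_signature

    duration = metadata.get("duration", 120)
    duration = 120 if duration == "N/A" else to_number(duration, float, 120)

    return {"bpm": bpm, "key_scale": key_scale,
            "time_signature": time_signature, "audio_duration": duration}

# Example:
# normalize_lm_metadata({"bpm": "128.0", "keyscale": "N/A", "duration": "95.5"})
# -> {"bpm": 128, "key_scale": "A major", "time_signature": "4", "audio_duration": 95.5}
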