ChuxiJ committed
Commit 25e1b40 · Parent: 1e1527a

add 5hz llm
acestep/gradio_ui.py CHANGED
@@ -204,7 +204,6 @@ def create_generation_section(handler) -> dict:
             label="Initialize 5Hz LM",
             value=False,
             info="Check to initialize 5Hz LM during service initialization",
-            interactive=False
         )
 
     with gr.Row():
@@ -298,7 +297,7 @@ def create_generation_section(handler) -> dict:
         )
 
         # 5Hz LM
-        with gr.Row(visible=False) as use_5hz_lm_row:
+        with gr.Row(visible=True) as use_5hz_lm_row:
            use_5hz_lm_btn = gr.Button(
                "Generate LM Hints",
                variant="secondary",
@@ -748,9 +747,36 @@ def setup_event_handlers(demo, handler, dataset_section, generation_section, res
     def generate_lm_hints_wrapper(caption, lyrics, temperature):
         """Wrapper for 5Hz LM generation"""
         metadata, audio_codes, status = handler.generate_with_5hz_lm(caption, lyrics, temperature)
-        # Return the formatted result; adjust as needed
-        result_text = f"Status: {status}\n\nMetadata: {metadata}\n\nAudio Codes: {audio_codes[:200]}..." if len(audio_codes) > 200 else f"Status: {status}\n\nMetadata: {metadata}\n\nAudio Codes: {audio_codes}"
-        return result_text
+
+        # Extract metadata values and map to UI fields
+        # Handle bpm
+        bpm_value = metadata.get('bpm', None)
+        if bpm_value == "N/A" or bpm_value == "":
+            bpm_value = None
+
+        # Handle key_scale (metadata uses 'keyscale')
+        key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
+        if key_scale_value == "N/A":
+            key_scale_value = ""
+
+        # Handle time_signature (metadata uses 'timesignature')
+        time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
+        if time_signature_value == "N/A":
+            time_signature_value = ""
+
+        # Handle audio_duration (metadata uses 'duration')
+        audio_duration_value = metadata.get('duration', -1)
+        if audio_duration_value == "N/A" or audio_duration_value == "":
+            audio_duration_value = -1
+
+        # Return audio codes and all metadata fields
+        return (
+            audio_codes,           # text2music_audio_code_string
+            bpm_value,             # bpm
+            key_scale_value,       # key_scale
+            time_signature_value,  # time_signature
+            audio_duration_value,  # audio_duration
+        )
 
     generation_section["use_5hz_lm_btn"].click(
         fn=generate_lm_hints_wrapper,
@@ -759,7 +785,13 @@ def setup_event_handlers(demo, handler, dataset_section, generation_section, res
             generation_section["lyrics"],
             generation_section["lm_temperature"]
         ],
-        outputs=[generation_section["text2music_audio_code_string"]]
+        outputs=[
+            generation_section["text2music_audio_code_string"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["audio_duration"],
+        ]
     )
 
     # Update instruction and UI visibility based on task type
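
Note: Gradio maps the wrapper's return tuple onto the outputs list by position, so the order (audio codes, bpm, key_scale, time_signature, audio_duration) must match on both sides. Below is a minimal, framework-free sketch of the same "N/A" normalization; the helper name normalize_lm_metadata is hypothetical and not part of this commit.

def normalize_lm_metadata(metadata: dict) -> tuple:
    """Map LM metadata onto the neutral defaults the UI widgets expect."""
    bpm = metadata.get("bpm")
    bpm = None if bpm in ("N/A", "") else bpm

    key_scale = metadata.get("keyscale", metadata.get("key_scale", ""))
    key_scale = "" if key_scale == "N/A" else key_scale

    time_signature = metadata.get("timesignature", metadata.get("time_signature", ""))
    time_signature = "" if time_signature == "N/A" else time_signature

    duration = metadata.get("duration", -1)
    duration = -1 if duration in ("N/A", "") else duration

    return bpm, key_scale, time_signature, duration

# Example: a missing or "N/A" bpm becomes None, the rest pass through unchanged.
print(normalize_lm_metadata({"bpm": "N/A", "keyscale": "G major", "timesignature": "4", "duration": 273}))
# -> (None, 'G major', '4', 273)
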
acestep/handler.py CHANGED
@@ -4,6 +4,7 @@ Encapsulates all data processing and business logic as a bridge between model an
 """
 import os
 import math
+from copy import deepcopy
 import tempfile
 import traceback
 import re
@@ -58,9 +59,10 @@ class AceStepHandler:
         self.sample_rate = 48000
 
         # 5Hz LM related
-        self.lm_model = None
-        self.lm_tokenizer = None
-        self.lm_initialized = False
+        self.llm = None
+        self.llm_tokenizer = None
+        self.llm_initialized = False
+        self.llm_backend = None
 
         # Reward model (temporarily disabled)
         self.reward_model = None
@@ -218,12 +220,43 @@ class AceStepHandler:
         if init_llm:
             full_lm_model_path = os.path.join(checkpoint_dir, lm_model_path)
             if os.path.exists(full_lm_model_path):
+                print("loading 5Hz LM tokenizer...")
+                start_time = time.time()
+                llm_tokenizer = deepcopy(self.text_tokenizer)
+                max_audio_length = 2**16 - 1
+                semantic_tokens = [f"<|audio_code_{i}|>" for i in range(max_audio_length)]
+                # 217204
+                llm_tokenizer.add_special_tokens({"additional_special_tokens": semantic_tokens})
+                print(f"5Hz LM tokenizer loaded successfully in {time.time() - start_time:.2f} seconds")
+                self.llm_tokenizer = llm_tokenizer
                 if device == "cuda":
                     status_msg = self._initialize_5hz_lm_cuda(full_lm_model_path)
-                    if not self.llm_initialized:
-                        return status_msg, False
-                self.llm = AutoModel.from_pretrained(full_lm_model_path)
-                self.llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path)
+                    print(f"5Hz LM status message: {status_msg}")
+                    # Check if initialization failed (status_msg starts with ❌)
+                    if status_msg.startswith("❌"):
+                        # vllm initialization failed, fall back to PyTorch
+                        if not self.llm_initialized:
+                            try:
+                                self.llm = AutoModel.from_pretrained(full_lm_model_path)
+                                self.llm = self.llm.to(device).to(self.dtype)
+                                self.llm.eval()
+                                self.llm_backend = "pt"
+                                self.llm_initialized = True
+                            except Exception as e:
+                                return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
+                    # If vllm initialization succeeded, self.llm_initialized should already be True
+                else:
+                    # For CPU or other devices, use the PyTorch backend
+                    try:
+                        self.llm = AutoModel.from_pretrained(full_lm_model_path)
+                        self.llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
+                        self.llm = self.llm.to(device).to(self.dtype)
+                        self.llm.eval()
+                        self.llm_backend = "pt"
+                        self.llm_initialized = True
+                    except Exception as e:
+                        return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
+
             else:
                 # 5Hz LM path not found
                 return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
@@ -266,6 +299,10 @@ class AceStepHandler:
             reserved_mem_bytes = torch.cuda.memory_reserved(device)
 
             total_gpu = total_gpu_mem_bytes / 1024**3
+            low_gpu_memory_mode = False
+            if total_gpu < minimal_gpu:
+                minimal_gpu = 0.5 * total_gpu
+                low_gpu_memory_mode = True
             allocated_gpu = allocated_mem_bytes / 1024**3
             reserved_gpu = reserved_mem_bytes / 1024**3
             available_gpu = total_gpu - reserved_gpu
@@ -275,54 +312,64 @@ class AceStepHandler:
             else:
                 ratio = min(max_ratio, max(min_ratio, (available_gpu * 0.8) / total_gpu))
 
-            return ratio
+            return ratio, low_gpu_memory_mode
         except Exception as e:
-            return 0.9
+            return 0.9, low_gpu_memory_mode
 
     def _initialize_5hz_lm_cuda(self, model_path: str) -> str:
         """Initialize 5Hz LM model"""
+        if not torch.cuda.is_available():
+            self.llm_initialized = False
+            print("CUDA is not available. Please check your GPU setup.")
+            return "❌ CUDA is not available. Please check your GPU setup."
        try:
            from nanovllm import LLM, SamplingParams
-
-            if not torch.cuda.is_available():
-                return " CUDA is not available. Please check your GPU setup."
-
+        except ImportError:
+            self.llm_initialized = False
+            print("nano-vllm is not installed. Please install it using 'cd acestep/third_parts/nano-vllm && pip install .")
+            return "❌ nano-vllm is not installed. Please install it using 'cd acestep/third_parts/nano-vllm && pip install ."
+
+        try:
            current_device = torch.cuda.current_device()
            device_name = torch.cuda.get_device_name(current_device)
 
            torch.cuda.empty_cache()
-            gpu_memory_utilization = self.get_gpu_memory_utilization(
+            gpu_memory_utilization, low_gpu_memory_mode = self.get_gpu_memory_utilization(
                minimal_gpu=8,
                min_ratio=0.2,
                max_ratio=0.9
            )
+            if low_gpu_memory_mode:
+                self.max_model_len = 1024
+            else:
+                self.max_model_len = 2048
 
+            print(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
+            start_time = time.time()
            self.llm = LLM(
                model=model_path,
                enforce_eager=False,
                tensor_parallel_size=1,
-                max_model_len=4096,
+                max_model_len=self.max_model_len,
                gpu_memory_utilization=gpu_memory_utilization,
            )
-            self.llm_tokenizer = self.llm.tokenizer
+            print(f"5Hz LM initialized successfully in {time.time() - start_time:.2f} seconds")
+            self.llm.tokenizer = self.llm_tokenizer
            self.llm_initialized = True
+            self.llm_backend = "vllm"
            return f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nDevice: {device_name}\nGPU Memory Utilization: {gpu_memory_utilization:.2f}"
        except Exception as e:
            self.llm_initialized = False
            error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
            return error_msg
-
-    def generate_with_5hz_lm(self, caption: str, lyrics: str, temperature: float = 0.6) -> Tuple[Dict[str, Any], str, str]:
-        """Generate metadata and audio codes using 5Hz LM"""
-        if not self.lm_initialized or self.llm is None:
-            return {}, "", "❌ 5Hz LM not initialized. Please initialize it first."
-
+
+    def generate_with_5hz_lm_vllm(self, caption: str, lyrics: str, temperature: float = 0.6) -> Tuple[Dict[str, Any], str, str]:
        try:
            from nanovllm import SamplingParams
 
            prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
 
-            formatted_prompt = self.lm_tokenizer.apply_chat_template(
+            formatted_prompt = self.llm_tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
                    {"role": "user", "content": prompt}
@@ -330,10 +377,10 @@ class AceStepHandler:
                tokenize=False,
                add_generation_prompt=True,
            )
+            print("[debug] formatted_prompt: ", formatted_prompt)
 
-            sampling_params = SamplingParams(max_tokens=3072, temperature=temperature)
+            sampling_params = SamplingParams(max_tokens=self.max_model_len, temperature=temperature)
            outputs = self.llm.generate([formatted_prompt], sampling_params)
-
            if isinstance(outputs, list) and len(outputs) > 0:
                if hasattr(outputs[0], 'outputs') and len(outputs[0].outputs) > 0:
                    output_text = outputs[0].outputs[0].text
@@ -351,22 +398,113 @@ class AceStepHandler:
        except Exception as e:
            error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
            return {}, "", error_msg
+
+    def generate_with_5hz_lm_pt(self, caption: str, lyrics: str, temperature: float = 0.6) -> Tuple[Dict[str, Any], str, str]:
+        try:
+            prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
+
+            formatted_prompt = self.llm_tokenizer.apply_chat_template(
+                [
+                    {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
+                    {"role": "user", "content": prompt}
+                ],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+
+            # Tokenize the prompt
+            inputs = self.llm_tokenizer(
+                formatted_prompt,
+                return_tensors="pt",
+                padding=False,
+                truncation=True,
+            )
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+            # Generate with the model
+            with torch.no_grad():
+                # Get max_new_tokens from model config or use a default
+                max_new_tokens = getattr(self.llm.config, 'max_new_tokens', 4096)
+                if hasattr(self, 'max_model_len'):
+                    max_new_tokens = min(max_new_tokens, self.max_model_len)
+
+                outputs = self.llm.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    do_sample=True if temperature > 0 else False,
+                    pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
+                )
+
+            # Decode the generated tokens
+            # Only decode the newly generated tokens (skip the input prompt)
+            generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
+            output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
+
+            metadata, audio_codes = self.parse_lm_output(output_text)
+            codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
+            return metadata, audio_codes, f"✅ Generated successfully\nOutput length: {len(output_text)} chars\nCodes count: {codes_count}"
+
+        except Exception as e:
+            error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+            return {}, "", error_msg
+
+    def generate_with_5hz_lm(self, caption: str, lyrics: str, temperature: float = 0.6) -> Tuple[Dict[str, Any], str, str]:
+        """Generate metadata and audio codes using 5Hz LM"""
+        # Check if 5Hz LM is initialized
+        if not hasattr(self, 'llm_initialized') or not self.llm_initialized:
+            debug_info = f"llm_initialized={getattr(self, 'llm_initialized', 'not set')}, "
+            debug_info += f"has_llm={hasattr(self, 'llm')}, "
+            debug_info += f"llm_is_none={getattr(self, 'llm', None) is None}, "
+            debug_info += f"llm_backend={getattr(self, 'llm_backend', 'not set')}"
+            return {}, "", f"❌ 5Hz LM not initialized. Please initialize it first. Debug: {debug_info}"
+
+        if not hasattr(self, 'llm') or self.llm is None:
+            return {}, "", "❌ 5Hz LM model not loaded. Please initialize it first."
+
+        if not hasattr(self, 'llm_backend'):
+            return {}, "", "❌ 5Hz LM backend not set. Please initialize it first."
+
+        if self.llm_backend == "vllm":
+            return self.generate_with_5hz_lm_vllm(caption, lyrics, temperature)
+        else:
+            return self.generate_with_5hz_lm_pt(caption, lyrics, temperature)
 
     def parse_lm_output(self, output_text: str) -> Tuple[Dict[str, Any], str]:
-        """Parse LM output"""
+        """
+        Parse LM output to extract metadata and audio codes.
+
+        Expected format:
+        <think>
+        bpm: 73
+        duration: 273
+        genres: Chinese folk
+        keyscale: G major
+        timesignature: 4
+        </think>
+
+        <|audio_code_56535|><|audio_code_62918|>...
+
+        Returns:
+            Tuple of (metadata_dict, audio_codes_string)
+        """
+        debug_output_text = output_text.split("</think>")[0]
+        print(f"Debug output text: {debug_output_text}")
        metadata = {}
        audio_codes = ""
 
        import re
 
-        # Extract audio codes
+        # Extract audio codes - find all <|audio_code_XXX|> patterns
        code_pattern = r'<\|audio_code_\d+\|>'
        code_matches = re.findall(code_pattern, output_text)
        if code_matches:
            audio_codes = "".join(code_matches)
 
-        # Extract metadata
+        # Extract metadata from reasoning section
+        # Try different reasoning tag patterns
        reasoning_patterns = [
+            r'<think>(.*?)</think>',
            r'<think>(.*?)</think>',
            r'<reasoning>(.*?)</reasoning>',
        ]
@@ -378,7 +516,9 @@ class AceStepHandler:
                reasoning_text = match.group(1).strip()
                break
 
+        # If no reasoning tags found, try to parse metadata from the beginning of the output
        if not reasoning_text:
+            # Look for metadata lines before the audio codes
            lines_before_codes = output_text.split('<|audio_code_')[0] if '<|audio_code_' in output_text else output_text
            reasoning_text = lines_before_codes.strip()
 
@@ -402,8 +542,12 @@ class AceStepHandler:
                    metadata['duration'] = int(value)
                except:
                    metadata['duration'] = value
-            elif key in ['genres', 'keyscale', 'timesignature']:
-                metadata[key] = value
+            elif key == 'genres':
+                metadata['genres'] = value
+            elif key == 'keyscale':
+                metadata['keyscale'] = value
+            elif key == 'timesignature':
+                metadata['timesignature'] = value
 
        return metadata, audio_codes
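
Note: the new parse_lm_output docstring pins down the LM output contract: metadata as "key: value" lines inside <think>...</think>, followed by a run of <|audio_code_N|> tokens. A self-contained sketch of that parsing on a made-up output string (not real model output):

import re

sample = (
    "<think>\nbpm: 73\nduration: 273\ngenres: Chinese folk\n"
    "keyscale: G major\ntimesignature: 4\n</think>\n\n"
    "<|audio_code_56535|><|audio_code_62918|>"
)

# Audio codes: every <|audio_code_N|> token, concatenated in order.
audio_codes = "".join(re.findall(r"<\|audio_code_\d+\|>", sample))

# Metadata: "key: value" lines inside the <think> block.
metadata = {}
think = re.search(r"<think>(.*?)</think>", sample, re.DOTALL)
if think:
    for line in think.group(1).strip().splitlines():
        if ":" in line:
            key, value = line.split(":", 1)
            metadata[key.strip()] = value.strip()

print(metadata)     # {'bpm': '73', 'duration': '273', 'genres': 'Chinese folk', 'keyscale': 'G major', 'timesignature': '4'}
print(audio_codes)  # <|audio_code_56535|><|audio_code_62918|>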
 
acestep/third_parts/nano-vllm/nanovllm/config.py CHANGED
@@ -1,8 +1,35 @@
 import os
+import socket
 from dataclasses import dataclass
 from transformers import AutoConfig
 
 
+def find_available_port(start_port: int = 2333, max_attempts: int = 100) -> int:
+    """Find an available port starting from start_port.
+
+    Args:
+        start_port: The starting port number to check
+        max_attempts: Maximum number of ports to try
+
+    Returns:
+        An available port number
+
+    Raises:
+        RuntimeError: If no available port is found within max_attempts
+    """
+    for i in range(max_attempts):
+        port = start_port + i
+        try:
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+                s.bind(('localhost', port))
+                return port
+        except OSError:
+            # Port is in use, try next one
+            continue
+    raise RuntimeError(f"Could not find an available port starting from {start_port} after {max_attempts} attempts")
+
+
 @dataclass
 class Config:
     model: str
@@ -13,9 +40,10 @@ class Config:
     tensor_parallel_size: int = 1
     enforce_eager: bool = False
     hf_config: AutoConfig | None = None
-    eos: int = -1
+    eos: int = 151643
     kvcache_block_size: int = 256
     num_kvcache_blocks: int = -1
+    dist_port: int | None = None
 
     def __post_init__(self):
         assert os.path.isdir(self.model)
@@ -24,3 +52,6 @@ class Config:
         self.hf_config = AutoConfig.from_pretrained(self.model)
         self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
         assert self.max_num_batched_tokens >= self.max_model_len
+        # Auto-find available port if not specified
+        if self.dist_port is None:
+            self.dist_port = find_available_port()
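
Note: find_available_port scans upward from start_port and returns the first port it can bind. A quick standalone check of that scan, assuming port 2333 is otherwise free; the helper is re-declared here so the snippet runs without nanovllm installed, and SO_REUSEADDR semantics vary by platform:

import socket

def find_available_port(start_port: int = 2333, max_attempts: int = 100) -> int:
    # Same bind-test scan as nanovllm.config.find_available_port.
    for i in range(max_attempts):
        port = start_port + i
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                s.bind(("localhost", port))
                return port
        except OSError:
            continue
    raise RuntimeError(f"no free port in [{start_port}, {start_port + max_attempts})")

# Occupy the default starting port, then confirm the scan skips past it.
blocker = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
blocker.bind(("localhost", 2333))
print(find_available_port(2333))  # typically 2334 on Linux, since 2333 is held by `blocker`
blocker.close()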
acestep/third_parts/nano-vllm/nanovllm/engine/llm_engine.py CHANGED
@@ -21,6 +21,28 @@ class LLMEngine:
         self.ps = []
         self.events = []
         ctx = mp.get_context("spawn")
+
+        # Pre-validate port availability by attempting to bind to it
+        # This helps avoid race conditions when multiple LLMEngine instances start simultaneously
+        import socket
+        from nanovllm.config import find_available_port
+        max_port_retries = 10
+        for port_attempt in range(max_port_retries):
+            try:
+                # Test if port is actually available by binding to it
+                test_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                test_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+                test_socket.bind(('localhost', config.dist_port))
+                test_socket.close()
+                # Port is available, break
+                break
+            except OSError:
+                # Port is in use, find next available
+                if port_attempt < max_port_retries - 1:
+                    config.dist_port = find_available_port(start_port=config.dist_port + 1, max_attempts=10)
+                else:
+                    raise RuntimeError(f"Failed to find available port after {max_port_retries} attempts")
+
         for i in range(1, config.tensor_parallel_size):
             event = ctx.Event()
             process = ctx.Process(target=ModelRunner, args=(config, i, event))
@@ -28,8 +50,7 @@ class LLMEngine:
             self.ps.append(process)
             self.events.append(event)
         self.model_runner = ModelRunner(config, 0, self.events)
-        self.tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
-        config.eos = self.tokenizer.eos_token_id
+        self.tokenizer = None
         self.scheduler = Scheduler(config)
         atexit.register(self.exit)
 
acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py CHANGED
@@ -1,10 +1,11 @@
 import pickle
+import socket
 import torch
 import torch.distributed as dist
 from multiprocessing.synchronize import Event
 from multiprocessing.shared_memory import SharedMemory
 
-from nanovllm.config import Config
+from nanovllm.config import Config, find_available_port
 from nanovllm.engine.sequence import Sequence
 from nanovllm.models.qwen3 import Qwen3ForCausalLM
 from nanovllm.layers.sampler import Sampler
@@ -23,7 +24,32 @@ class ModelRunner:
         self.rank = rank
         self.event = event
 
-        dist.init_process_group("nccl", "tcp://localhost:2333", world_size=self.world_size, rank=rank)
+        # Try to initialize process group with retry logic for port conflicts
+        # Only rank 0 binds to the port, so only rank 0 needs retry logic
+        dist_port = self.config.dist_port
+        max_retries = 10
+        for attempt in range(max_retries):
+            try:
+                dist.init_process_group("nccl", f"tcp://localhost:{dist_port}", world_size=self.world_size, rank=rank)
+                break
+            except RuntimeError as e:
+                if ("EADDRINUSE" in str(e) or "address already in use" in str(e).lower()) and rank == 0:
+                    # Port is in use, try next port (only for rank 0)
+                    if attempt < max_retries - 1:
+                        # Find next available port
+                        dist_port = find_available_port(start_port=dist_port + 1, max_attempts=10)
+                        self.config.dist_port = dist_port
+                        # If we had a previous failed attempt, destroy any partial process group
+                        if dist.is_initialized():
+                            try:
+                                dist.destroy_process_group()
+                            except:
+                                pass
+                    else:
+                        raise RuntimeError(f"Failed to find available port after {max_retries} attempts. Last error: {e}")
+                else:
+                    # Other error or non-rank-0 process, re-raise
+                    raise
         torch.cuda.set_device(rank)
         default_dtype = torch.get_default_dtype()
         torch.set_default_dtype(hf_config.torch_dtype)
@@ -118,9 +144,15 @@ class ModelRunner:
             layer_id += 1
 
     def prepare_block_tables(self, seqs: list[Sequence]):
-        max_len = max(len(seq.block_table) for seq in seqs)
+        max_len = max(len(seq.block_table) for seq in seqs) if seqs else 0
+        if max_len == 0:
+            # Return empty 2D tensor with correct shape
+            return torch.zeros((len(seqs), 0), dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
         block_tables = [seq.block_table + [-1] * (max_len - len(seq.block_table)) for seq in seqs]
         block_tables = torch.tensor(block_tables, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        # Ensure it's 2D: if only one sequence, shape should be [1, max_len]
+        if block_tables.dim() == 1:
+            block_tables = block_tables.unsqueeze(0)
         return block_tables
 
     def prepare_prefill(self, seqs: list[Sequence]):
@@ -215,7 +247,29 @@ class ModelRunner:
         graph_vars["slot_mapping"][:bs] = context.slot_mapping
         graph_vars["context_lens"].zero_()
         graph_vars["context_lens"][:bs] = context.context_lens
-        graph_vars["block_tables"][:bs, :context.block_tables.size(1)] = context.block_tables
+        # Handle block_tables: ensure it's 2D and size matches
+        if context.block_tables is not None and context.block_tables.numel() > 0:
+            # Ensure block_tables is 2D
+            if context.block_tables.dim() == 1:
+                # Reshape 1D to 2D: [num_blocks] -> [1, num_blocks]
+                block_tables_2d = context.block_tables.unsqueeze(0)
+            else:
+                block_tables_2d = context.block_tables
+
+            # Get dimensions
+            context_bs = block_tables_2d.size(0)
+            context_num_blocks = block_tables_2d.size(1)
+            graph_num_blocks = graph_vars["block_tables"].size(1)
+
+            # Use minimum to avoid size mismatch
+            num_blocks_to_copy = min(context_num_blocks, graph_num_blocks)
+            actual_bs = min(bs, context_bs)
+
+            # Copy block_tables with size matching
+            graph_vars["block_tables"][:actual_bs, :num_blocks_to_copy] = block_tables_2d[:actual_bs, :num_blocks_to_copy]
+            # Fill remaining with -1 if needed
+            if num_blocks_to_copy < graph_num_blocks:
+                graph_vars["block_tables"][:actual_bs, num_blocks_to_copy:] = -1
         graph.replay()
         return self.model.compute_logits(graph_vars["outputs"][:bs])
 
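Note: the prepare_block_tables change guarantees a 2D, -1-padded block table even for a single sequence or an empty batch. A CPU-only sketch of that padding, with made-up block ids:

import torch

block_tables = [[0, 1, 2], [3], [4, 5]]  # per-sequence block ids (illustrative)
max_len = max(len(t) for t in block_tables) if block_tables else 0
padded = [t + [-1] * (max_len - len(t)) for t in block_tables]
tensor = torch.tensor(padded, dtype=torch.int32)
print(tensor)
# tensor([[ 0,  1,  2],
#         [ 3, -1, -1],
#         [ 4,  5, -1]], dtype=torch.int32)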
 
requirements.txt CHANGED
@@ -4,4 +4,5 @@ diffusers
 gradio
 soundfile
 loguru
-einops
+einops
+accelerator