ChuxiJ committed on
Commit a161649 · 1 Parent(s): 4bcd037
acestep/constrained_logits_processor.py CHANGED
@@ -35,6 +35,9 @@ class FSMState(Enum):
@@ -74,7 +77,8 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
-        **kwargs: Any,
@@ -89,6 +93,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -103,6 +108,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -143,6 +149,16 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -186,6 +202,8 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -196,6 +214,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -211,17 +230,19 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
-            current_field: Current field name ("bpm", "caption", "duration", "keyscale", "language", "timesignature")
-        field_order = ["bpm", "caption", "duration","keyscale", "language", "timesignature"]
@@ -235,7 +256,10 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -257,7 +281,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
-        # Field order: bpm -> caption -> duration -> keyscale -> language -> timesignature
@@ -271,6 +295,11 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -284,6 +313,11 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -366,13 +400,14 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
-        for field in ["bpm", "caption", "duration", "keyscale", "language", "timesignature"]:
@@ -437,6 +472,10 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -531,7 +570,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -808,6 +847,133 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -859,8 +1025,36 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -870,6 +1064,108 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -1061,6 +1357,26 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -1265,6 +1581,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -1428,9 +1745,11 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
-                    # It's indentation, continue caption
@@ -1505,7 +1824,55 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -1561,7 +1928,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -1780,6 +2147,20 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
@@ -1787,8 +2168,9 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
-            # Track if this token is a newline (for transition detection)
-            if generated_token_id == self.newline_token:
@@ -1813,6 +2195,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
35
  DURATION_NAME = auto() # Generating "duration: "
36
  DURATION_VALUE = auto() # Generating numeric value 10-600
37
  NEWLINE_AFTER_DURATION = auto()
38
+ GENRES_NAME = auto() # Generating "genres: "
39
+ GENRES_VALUE = auto() # Generating any non-empty string
40
+ NEWLINE_AFTER_GENRES = auto()
41
  KEYSCALE_NAME = auto() # Generating "keyscale: "
42
  KEYSCALE_VALUE = auto() # Generating keyscale pattern
43
  NEWLINE_AFTER_KEYSCALE = auto()
 
77
  tokenizer: AutoTokenizer,
78
  enabled: bool = True,
79
  debug: bool = False,
80
+ genres_vocab_path: Optional[str] = None,
81
+ skip_genres: bool = True,
82
  ):
83
  """
84
  Initialize the constrained logits processor.
 
93
  self.tokenizer = tokenizer
94
  self.enabled = enabled
95
  self.debug = debug
96
+ self.skip_genres = skip_genres
97
  self.skip_caption = False # Set to True to skip caption field generation
98
  self.skip_language = False # Set to True to skip language field generation
99
  self.caption: Optional[str] = None # Set via update_caption() before each generation
 
108
  "keyscale": None,
109
  "language": None,
110
  "timesignature": None,
111
+ "genres": None,
112
  }
113
 
114
  # Temperature settings for different generation phases (set per-generation)
 
149
  # Pre-compute token IDs for efficiency
150
  self._precompute_tokens()
151
 
152
+ # Genres vocabulary for constrained decoding
153
+ self.genres_vocab_path = genres_vocab_path or os.path.join(
154
+ os.path.dirname(os.path.abspath(__file__)), "genres_vocab.txt"
155
+ )
156
+ self.genres_vocab: List[str] = [] # Full vocab
157
+ self.genres_vocab_mtime: float = 0.0
158
+ self.genres_trie: Dict = {} # Trie for full vocab (fallback)
159
+ self.caption_genres_trie: Dict = {} # Trie for caption-matched genres (priority)
160
+ self.caption_matched_genres: List[str] = [] # Genres matched from caption
161
+
162
  self._char_to_tokens: Dict[str, set] = {} # Precomputed char -> token IDs mapping
163
 
164
  # Precompute token mappings once (O(vocab_size), runs once at init)
 
202
  # Build language prefix tree (similar to keyscale but for language codes)
203
  self.language_prefix_tree = self._build_language_prefix_tree()
204
 
205
+ self._load_genres_vocab()
206
+
207
  # Fixed strings for each state
208
  # IMPORTANT: Do NOT include trailing space after colon - tokenizer will handle spacing
209
  # All matching should be done at token level, not string level
 
214
  FSMState.BPM_NAME: "bpm:",
215
  FSMState.CAPTION_NAME: "caption:",
216
  FSMState.DURATION_NAME: "duration:",
217
+ FSMState.GENRES_NAME: "genres:",
218
  FSMState.KEYSCALE_NAME: "keyscale:",
219
  FSMState.LANGUAGE_NAME: "language:",
220
  FSMState.TIMESIG_NAME: "timesignature:",
 
230
  even if the field is user-provided (we still need to generate the field name).
231
 
232
  Args:
233
+ current_field: Current field name ("bpm", "caption", "duration", "genres", "keyscale", "language", "timesignature")
234
 
235
  Returns:
236
  Next FSMState (NAME state of next field), or THINK_END_TAG if no more fields
237
  """
238
  # New field order: bpm -> caption -> duration -> keyscale -> language -> timesignature
239
+ # genres is optional and can be skipped
240
+ field_order = ["bpm", "caption", "duration", "genres", "keyscale", "language", "timesignature"]
241
  field_to_state = {
242
  "bpm": FSMState.BPM_NAME,
243
  "caption": FSMState.CAPTION_NAME,
244
  "duration": FSMState.DURATION_NAME,
245
+ "genres": FSMState.GENRES_NAME,
246
  "keyscale": FSMState.KEYSCALE_NAME,
247
  "language": FSMState.LANGUAGE_NAME,
248
  "timesignature": FSMState.TIMESIG_NAME,
 
256
  # Find next field in order
257
  for i in range(current_idx + 1, len(field_order)):
258
  field = field_order[i]
259
+
260
+ # Skip fields based on flags
261
+ if field == "genres" and self.skip_genres:
262
+ continue
263
  if field == "caption" and self.skip_caption:
264
  continue
265
  if field == "language" and self.skip_language:
 
281
  }
282
 
283
  # Build transitions for all fields (even if user-provided, we still need to generate field name)
284
+ # Field order: bpm -> caption -> duration -> genres -> keyscale -> language -> timesignature
285
 
286
  # BPM field: NAME -> VALUE -> next field (caption or duration)
287
  self.next_state[FSMState.BPM_NAME] = FSMState.BPM_VALUE
 
295
  # Duration field: NAME -> VALUE -> next field
296
  self.next_state[FSMState.DURATION_NAME] = FSMState.DURATION_VALUE
297
  self.next_state[FSMState.DURATION_VALUE] = self._get_next_field_state("duration")
298
+
299
+ # Genres field (only if not skipped): NAME -> VALUE -> next field
300
+ if not self.skip_genres:
301
+ self.next_state[FSMState.GENRES_NAME] = FSMState.GENRES_VALUE
302
+ self.next_state[FSMState.GENRES_VALUE] = self._get_next_field_state("genres")
303
 
304
  # Keyscale field: NAME -> VALUE -> next field (language or timesignature)
305
  self.next_state[FSMState.KEYSCALE_NAME] = FSMState.KEYSCALE_VALUE
 
313
  # Timesignature field: NAME -> VALUE -> THINK_END_TAG
314
  self.next_state[FSMState.TIMESIG_NAME] = FSMState.TIMESIG_VALUE
315
  self.next_state[FSMState.TIMESIG_VALUE] = FSMState.THINK_END_TAG
316
+
317
+ def set_skip_genres(self, skip: bool):
318
+ """Set whether to skip genres generation and rebuild state transitions."""
319
+ self.skip_genres = skip
320
+ self._build_state_transitions()
321
 
322
  def set_skip_caption(self, skip: bool):
323
  """Set whether to skip caption generation and rebuild state transitions."""
 
400
  - "keyscale": Optional[str] - e.g., "G major"
401
  - "language": Optional[str] - e.g., "en"
402
  - "timesignature": Optional[str] - e.g., "4"
403
+ - "genres": Optional[str] - e.g., "Pop Rock"
404
  If None, clears all user-provided metadata.
405
  """
406
  if metadata is None:
407
  metadata = {}
408
 
409
  # Update user-provided metadata
410
+ for field in ["bpm", "caption", "duration", "keyscale", "language", "timesignature", "genres"]:
411
  if field in metadata:
412
  self.user_provided_metadata[field] = metadata[field]
413
  else:
 
472
 
473
  # Vocab size
474
  self.vocab_size = len(self.tokenizer)
475
+
476
+ # Comma token for multi-genre support
477
+ comma_tokens = self.tokenizer.encode(",", add_special_tokens=False)
478
+ self.comma_token = comma_tokens[-1] if comma_tokens else None
479
 
480
  # EOS token for duration-constrained codes generation
481
  self.eos_token_id = self.tokenizer.eos_token_id
 
570
 
571
  if self.debug:
572
  logger.debug(f"Built audio code masks for {len(self.audio_code_token_ids)} tokens")
573
+
574
  def _build_keyscale_prefix_tree(self) -> Dict[Tuple[int, ...], Set[int]]:
575
  """
576
  Build keyscale prefix to allowed tokens mapping based on ACTUAL tokenization.
 
847
 
848
  print("=" * 60)
849
 
850
+
851
+ def _load_genres_vocab(self):
852
+ """
853
+ Load genres vocabulary from file. Supports hot reload by checking file mtime.
854
+ File format: one genre per line, lines starting with # are comments.
855
+ """
856
+ if not os.path.exists(self.genres_vocab_path):
857
+ if self.debug:
858
+ logger.debug(f"Genres vocab file not found: {self.genres_vocab_path}")
859
+ return
860
+
861
+ try:
862
+ mtime = os.path.getmtime(self.genres_vocab_path)
863
+ if mtime <= self.genres_vocab_mtime:
864
+ return # File hasn't changed
865
+
866
+ with open(self.genres_vocab_path, 'r', encoding='utf-8') as f:
867
+ genres = []
868
+ for line in f:
869
+ line = line.strip()
870
+ if line and not line.startswith('#'):
871
+ genres.append(line.lower())
872
+
873
+ self.genres_vocab = genres
874
+ self.genres_vocab_mtime = mtime
875
+ self._build_genres_trie()
876
+
877
+ if self.debug:
878
+ logger.debug(f"Loaded {len(self.genres_vocab)} genres from {self.genres_vocab_path}")
879
+ except Exception as e:
880
+ logger.warning(f"Failed to load genres vocab: {e}")
881
+
882
+ def _build_genres_trie(self):
883
+ """
884
+ Build a trie (prefix tree) from genres vocabulary for efficient prefix matching.
885
+ Each node is a dict with:
886
+ - '_end': True if this node represents a complete genre
887
+ - other keys: next characters in the trie
888
+ """
889
+ self.genres_trie = {}
890
+
891
+ for genre in self.genres_vocab:
892
+ node = self.genres_trie
893
+ for char in genre:
894
+ if char not in node:
895
+ node[char] = {}
896
+ node = node[char]
897
+ node['_end'] = True # Mark end of a complete genre
898
+
899
+ if self.debug:
900
+ logger.debug(f"Built genres trie with {len(self.genres_vocab)} entries")
901
+
902
+ def _extract_caption_genres(self, caption: str):
903
+ """
904
+ Extract genres from the user's caption that match entries in the vocabulary.
905
+ This creates a smaller trie for faster and more relevant genre generation.
906
+
907
+ Strategy (optimized - O(words * max_genre_len) instead of O(vocab_size)):
908
+ 1. Extract words/phrases from caption
909
+ 2. For each word, use trie to find all vocab entries that START with this word
910
+ 3. Build a separate trie from matched genres
911
+ """
912
+ if not caption or not self.genres_vocab:
913
+ return
914
+
915
+ caption_lower = caption.lower()
916
+ matched_genres = set()
917
+
918
+ # Extract words from caption (split by common delimiters)
919
+ import re
920
+ words = re.split(r'[,\s\-_/\\|]+', caption_lower)
921
+ words = [w.strip() for w in words if w.strip() and len(w.strip()) >= 2]
922
+
923
+ # For each word, find genres in trie that start with this word
924
+ for word in words:
925
+ # Find all genres starting with this word using trie traversal
926
+ node = self._get_genres_trie_node(word)
927
+ if node is not None:
928
+ # Collect all complete genres under this node
929
+ self._collect_complete_genres(node, word, matched_genres)
930
+
931
+ # Also check if any word appears as a substring in short genres (< 20 chars)
932
+ # This is a quick check for common single-word genres
933
+ genres_set = set(self.genres_vocab)
934
+ for word in words:
935
+ if word in genres_set:
936
+ matched_genres.add(word)
937
+
938
+ if not matched_genres:
939
+ if self.debug:
940
+ logger.debug(f"No genres matched in caption, using full vocab")
941
+ return
942
+
943
+ # Build a trie from matched genres
944
+ self.caption_matched_genres = list(matched_genres)
945
+ self.caption_genres_trie = {}
946
+
947
+ for genre in matched_genres:
948
+ node = self.caption_genres_trie
949
+ for char in genre:
950
+ if char not in node:
951
+ node[char] = {}
952
+ node = node[char]
953
+ node['_end'] = True
954
+
955
+ if self.debug:
956
+ logger.debug(f"Matched {len(matched_genres)} genres from caption: {list(matched_genres)[:5]}...")
957
+
958
+ def _collect_complete_genres(self, node: Dict, prefix: str, result: set, max_depth: int = 50):
959
+ """
960
+ Recursively collect all complete genres under a trie node.
961
+ Limited depth to avoid too many matches.
962
+ """
963
+ if max_depth <= 0:
964
+ return
965
+
966
+ if node.get('_end', False):
967
+ result.add(prefix)
968
+
969
+ # Limit total collected genres to avoid slowdown
970
+ if len(result) >= 100:
971
+ return
972
+
973
+ for char, child_node in node.items():
974
+ if char not in ('_end', '_tokens'):
975
+ self._collect_complete_genres(child_node, prefix + char, result, max_depth - 1)
976
+
977
  def _precompute_char_token_mapping(self):
978
  """
979
  Precompute mapping from characters to token IDs and token decoded texts.
 
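For illustration only, here is a hypothetical genres_vocab.txt in the format `_load_genres_vocab` expects (the path and genre names are made up), together with the list it would load:

# genres_vocab.txt -- one genre per line, '#' lines are comments (hypothetical content):
#
#     # seed genres
#     Pop
#     pop rock
#     jazz
#
# _load_genres_vocab() keeps non-empty, non-comment lines and lowercases them,
# so the loaded vocabulary would be:
genres_vocab = ["pop", "pop rock", "jazz"]
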
1025
 
1026
  if self.debug:
1027
  logger.debug(f"Precomputed char->token mapping for {len(self._char_to_tokens)} unique characters")
1028
+
1029
+ def _try_reload_genres_vocab(self):
1030
+ """Check if genres vocab file has been updated and reload if necessary."""
1031
+ if not os.path.exists(self.genres_vocab_path):
1032
+ return
1033
+
1034
+ try:
1035
+ mtime = os.path.getmtime(self.genres_vocab_path)
1036
+ if mtime > self.genres_vocab_mtime:
1037
+ self._load_genres_vocab()
1038
+ except Exception:
1039
+ pass # Ignore errors during hot reload check
1040
+
1041
+ def _get_genres_trie_node(self, prefix: str) -> Optional[Dict]:
1042
+ """
1043
+ Get the trie node for a given prefix.
1044
+ Returns None if the prefix is not valid (no genres start with this prefix).
1045
+ """
1046
+ node = self.genres_trie
1047
+ for char in prefix.lower():
1048
+ if char not in node:
1049
+ return None
1050
+ node = node[char]
1051
+ return node
1052
+
1053
+ def _is_complete_genre(self, text: str) -> bool:
1054
+ """Check if the given text is a complete genre in the vocabulary."""
1055
+ node = self._get_genres_trie_node(text.strip())
1056
+ return node is not None and node.get('_end', False)
1057
+
1058
  def _get_trie_node_from_trie(self, trie: Dict, prefix: str) -> Optional[Dict]:
1059
  """Get a trie node from a specific trie (helper for caption vs full trie)."""
1060
  node = trie
 
1064
  node = node[char]
1065
  return node
1066
 
1067
+ def _get_allowed_genres_tokens(self) -> List[int]:
1068
+ """
1069
+ Get allowed tokens for genres field based on trie matching.
1070
+
1071
+ The entire genres string (including commas) must match a complete entry in the vocab.
1072
+ For example, if vocab contains "pop, rock, jazz", the generated string must exactly
1073
+ match that entry - we don't treat commas as separators for individual genres.
1074
+
1075
+ Strategy:
1076
+ 1. If caption-matched genres exist, use that smaller trie first (faster + more relevant)
1077
+ 2. If no caption matches or prefix not in caption trie, fallback to full vocab trie
1078
+ 3. Get valid next characters from current trie node
1079
+ 4. For each candidate token, verify the full decoded text forms a valid trie prefix
1080
+ """
1081
+ if not self.genres_vocab:
1082
+ # No vocab loaded, allow all except newline if empty
1083
+ return []
1084
+
1085
+ # Use the full accumulated value (don't split by comma - treat as single entry)
1086
+ accumulated = self.accumulated_value.lower()
1087
+ current_genre_prefix = accumulated.strip()
1088
+
1089
+ # Determine which trie to use: caption-matched (priority) or full vocab (fallback)
1090
+ use_caption_trie = False
1091
+ current_node = None
1092
+
1093
+ # Try caption-matched trie first if available
1094
+ if self.caption_genres_trie:
1095
+ if current_genre_prefix == "":
1096
+ current_node = self.caption_genres_trie
1097
+ use_caption_trie = True
1098
+ else:
1099
+ current_node = self._get_trie_node_from_trie(self.caption_genres_trie, current_genre_prefix)
1100
+ if current_node is not None:
1101
+ use_caption_trie = True
1102
+
1103
+ # Fallback to full vocab trie
1104
+ if current_node is None:
1105
+ if current_genre_prefix == "":
1106
+ current_node = self.genres_trie
1107
+ else:
1108
+ current_node = self._get_genres_trie_node(current_genre_prefix)
1109
+
1110
+ if current_node is None:
1111
+ # Invalid prefix, force newline to end
1112
+ if self.newline_token:
1113
+ return [self.newline_token]
1114
+ return []
1115
+
1116
+ # Get valid next characters from trie node
1117
+ valid_next_chars = set(k for k in current_node.keys() if k not in ('_end', '_tokens'))
1118
+
1119
+ # If current value is a complete genre, allow newline to end
1120
+ is_complete = current_node.get('_end', False)
1121
+
1122
+ if not valid_next_chars:
1123
+ # No more characters to match, only allow newline if complete
1124
+ allowed = set()
1125
+ if is_complete and self.newline_token:
1126
+ allowed.add(self.newline_token)
1127
+ return list(allowed)
1128
+
1129
+ # Collect candidate tokens based on first character
1130
+ candidate_tokens = set()
1131
+ for char in valid_next_chars:
1132
+ if char in self._char_to_tokens:
1133
+ candidate_tokens.update(self._char_to_tokens[char])
1134
+
1135
+ # Select the appropriate trie for validation
1136
+ active_trie = self.caption_genres_trie if use_caption_trie else self.genres_trie
1137
+
1138
+ # Validate each candidate token: check if prefix + decoded_token is a valid trie prefix
1139
+ allowed = set()
1140
+ for token_id in candidate_tokens:
1141
+ # Use precomputed decoded text (already normalized)
1142
+ decoded_normalized = self._token_to_text.get(token_id, "")
1143
+
1144
+ if not decoded_normalized or not decoded_normalized.strip():
1145
+ # Token decodes to empty or only whitespace - allow if space/comma is a valid next char
1146
+ if ' ' in valid_next_chars or ',' in valid_next_chars:
1147
+ allowed.add(token_id)
1148
+ continue
1149
+
1150
+ # Build new prefix by appending decoded token
1151
+ # Handle space-prefixed tokens (e.g., " rock" from "pop rock")
1152
+ if decoded_normalized.startswith(' ') or decoded_normalized.startswith(','):
1153
+ # Token has leading space/comma - append directly
1154
+ new_prefix = current_genre_prefix + decoded_normalized
1155
+ else:
1156
+ new_prefix = current_genre_prefix + decoded_normalized
1157
+
1158
+ # Check if new_prefix is a valid prefix in the active trie
1159
+ new_node = self._get_trie_node_from_trie(active_trie, new_prefix)
1160
+ if new_node is not None:
1161
+ allowed.add(token_id)
1162
+
1163
+ # If current value is a complete genre, also allow newline
1164
+ if is_complete and self.newline_token:
1165
+ allowed.add(self.newline_token)
1166
+
1167
+ return list(allowed)
1168
+
1169
  def reset(self):
1170
  """Reset the processor state for a new generation."""
1171
  self.state = FSMState.THINK_TAG
 
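A self-contained sketch of the prefix walk that `_get_allowed_genres_tokens` performs, using a toy trie and made-up values (this is not the class method itself):

def build_trie(genres):
    # Same construction as _build_genres_trie: nested char dict, '_end' marks a full genre.
    trie = {}
    for genre in genres:
        node = trie
        for char in genre:
            node = node.setdefault(char, {})
        node["_end"] = True
    return trie

def allowed_next_chars(trie, prefix):
    # Walk the trie; an invalid prefix means the processor would force a newline.
    node = trie
    for char in prefix.lower():
        if char not in node:
            return set(), False
        node = node[char]
    next_chars = {k for k in node if k not in ("_end", "_tokens")}
    return next_chars, node.get("_end", False)

toy_trie = build_trie(["pop", "pop rock", "jazz"])
print(allowed_next_chars(toy_trie, "pop "))      # ({'r'}, False) -> only tokens continuing with 'r' stay allowed
print(allowed_next_chars(toy_trie, "pop rock"))  # (set(), True)  -> newline may end the field
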
1357
 
1358
  return newline_prob > max_digit_prob
1359
 
1360
+
1361
+ def _should_end_text_field(self, logits: torch.Tensor) -> bool:
1362
+ """
1363
+ Determine if we should end a text field (genres).
1364
+ Returns True if P(newline) > P(any other token) AND we have some content.
1365
+ """
1366
+ if not self.accumulated_value.strip():
1367
+ return False # Need at least some content
1368
+
1369
+ probs = torch.softmax(logits, dim=-1)
1370
+ newline_prob = probs[0, self.newline_token].item() if self.newline_token else 0
1371
+
1372
+ # Get max probability among non-newline tokens
1373
+ masked_probs = probs.clone()
1374
+ if self.newline_token:
1375
+ masked_probs[0, self.newline_token] = 0
1376
+ max_other_prob = masked_probs[0].max().item()
1377
+
1378
+ return newline_prob > max_other_prob
1379
+
1380
  def _get_allowed_keyscale_tokens(self) -> List[int]:
1381
  """
1382
  Get allowed tokens for keyscale field using the precomputed prefix tree.
 
1581
  "keyscale": "keyscale: ",
1582
  "language": "language: ",
1583
  "timesignature": "timesignature: ",
1584
+ "genres": "genres: ",
1585
  }
1586
  prefix = field_to_prefix[field_name]
1587
  full_text = f"{prefix}{value}\n"
 
1745
  # Allow free generation (no constraints) so LM can generate field name naturally
1746
  return scores
1747
  else:
1748
+ # It's indentation, continue caption (don't transition!)
1749
  self.caption_after_newline = False
1750
+ # Continue normal caption generation
1751
+ # Fall through to caption constraints below
1752
+
1753
  # If caption is ending (LM generating next field name), allow free generation
1754
  # and track the field name until we see colon
1755
  if self.caption_ending:
 
1824
  mask[0, self.newline_token] = 0
1825
 
1826
  scores = scores + mask
1827
+
1828
+ elif self.state == FSMState.GENRES_VALUE:
1829
+ # Check if field is user-provided and we haven't started injecting yet
1830
+ if self.user_provided_metadata["genres"] is not None and not self.user_field_token_queue and not self.accumulated_value:
1831
+ # Initialize token queue with field value tokens (value + newline)
1832
+ value = self.user_provided_metadata["genres"]
1833
+ value_text = f" {value}\n"
1834
+ value_tokens = self.tokenizer.encode(value_text, add_special_tokens=False)
1835
+ if value_tokens:
1836
+ self.user_field_token_queue = value_tokens
1837
+ self.current_user_field = "genres"
1838
+ # Inject first token
1839
+ mask[0, value_tokens[0]] = 0
1840
+ scores = scores + mask
1841
+ return scores
1842
+
1843
+ # Try to hot-reload genres vocab if file has changed
1844
+ self._try_reload_genres_vocab()
1845
+
1846
+ # Get allowed tokens based on genres vocabulary
1847
+ allowed = self._get_allowed_genres_tokens()
1848
+
1849
+ if allowed:
1850
+ # Use vocabulary-constrained decoding
1851
+ for t in allowed:
1852
+ mask[0, t] = 0
1853
+ scores = scores + mask
1854
+ elif self.genres_vocab:
1855
+ # Vocab is loaded but no valid continuation found
1856
+ # Force newline to end the field
1857
+ if self.newline_token:
1858
+ mask[0, self.newline_token] = 0
1859
+ if self.debug:
1860
+ logger.debug(f"No valid genre continuation for '{self.accumulated_value}', forcing newline")
1861
+ scores = scores + mask
1862
+ else:
1863
+ # Fallback: no vocab loaded, use probability-based ending
1864
+ if self._should_end_text_field(scores):
1865
+ if self.newline_token:
1866
+ mask[0, self.newline_token] = 0
1867
+ self._transition_to_next_state()
1868
+ scores = scores + mask
1869
+ else:
1870
+ # Allow any token except newline if we don't have content yet
1871
+ if not self.accumulated_value.strip():
1872
+ if self.newline_token:
1873
+ scores[0, self.newline_token] = float('-inf')
1874
+ # Otherwise, don't constrain (fallback behavior)
1875
+
1876
  elif self.state == FSMState.KEYSCALE_VALUE:
1877
  # Check if field is user-provided and we haven't started injecting yet
1878
  if self.user_provided_metadata["keyscale"] is not None and not self.user_field_token_queue and not self.accumulated_token_ids:
 
1928
  mask[0, value_tokens[0]] = 0
1929
  scores = scores + mask
1930
  return scores
1931
+
1932
  # If we haven't started generating language yet (empty accumulated_token_ids),
1933
  # select the top-1 probability token from all valid first tokens
1934
  if not self.accumulated_token_ids:
 
2147
  if token_str.strip().isdigit():
2148
  self.accumulated_value += token_str.strip()
2149
 
2150
+ elif self.state == FSMState.GENRES_VALUE:
2151
+ if generated_token_id == self.newline_token:
2152
+ # Newline ends the field
2153
+ self._transition_to_next_state()
2154
+ # IMPORTANT: After state transition, if new state is a fixed_strings state,
2155
+ # we should NOT update position_in_state with the newline token length,
2156
+ # because that token belongs to the old state, not the new state.
2157
+ # Return early to avoid the fixed_strings update logic below.
2158
+ if self.state in self.fixed_strings:
2159
+ return
2160
+ else:
2161
+ # Genres still uses string-based trie, so keep accumulated_value
2162
+ self.accumulated_value += token_str
2163
+
2164
  elif self.state == FSMState.CAPTION_VALUE:
2165
  # Track token count for 512 limit
2166
  self.caption_token_count += 1
 
2168
  # Accumulate caption text
2169
  self.accumulated_value += token_str
2170
 
2171
+ # Track if this token contains a newline (for transition detection)
2172
+ # Token may be '\n' alone or combined with other chars like '.\n'
2173
+ if '\n' in token_str:
2174
  # Mark that we need to check next token for field transition
2175
  self.caption_after_newline = True
2176
  else:
 
2195
  # Map field name to VALUE state
2196
  field_name_to_value_state = {
2197
  "duration": FSMState.DURATION_VALUE,
2198
+ "genres": FSMState.GENRES_VALUE,
2199
  "keyscale": FSMState.KEYSCALE_VALUE,
2200
  "language": FSMState.LANGUAGE_VALUE,
2201
  "timesignature": FSMState.TIMESIG_VALUE,
acestep/gradio_ui.py CHANGED
@@ -607,6 +607,12 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
@@ -618,11 +624,21 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
@@ -632,10 +648,14 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
-            info="Enable llm generate hints",
@@ -695,6 +715,9 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
@@ -720,7 +743,7 @@ def create_results_section(dit_handler) -> dict:
-                    "Send To Src Audio",
@@ -731,6 +754,17 @@ def create_results_section(dit_handler) -> dict:
@@ -739,7 +773,7 @@ def create_results_section(dit_handler) -> dict:
-                    "Send To Src Audio",
@@ -750,6 +784,17 @@ def create_results_section(dit_handler) -> dict:
@@ -780,6 +825,10 @@ def create_results_section(dit_handler) -> dict:
@@ -1042,11 +1091,12 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-    def sample_example_smart(task_type: str):
@@ -1060,6 +1110,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
@@ -1094,7 +1145,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-            gr.Info("🤖 Generated example using LM (Language Model)")
@@ -1285,6 +1336,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
@@ -1342,6 +1394,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
@@ -1471,7 +1524,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-            results_section["is_format_caption_state"]
@@ -1720,15 +1774,18 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-    def sample_example_smart_with_flag(task_type: str):
-        result = sample_example_smart(task_type)
-        inputs=[generation_section["task_type"]],
@@ -1743,13 +1800,14 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-    def transcribe_audio_codes(audio_code_string):
@@ -1763,7 +1821,11 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-        metadata, status = llm_handler.understand_audio_from_codes(audio_codes=audio_code_string, use_constrained_decoding=True)
@@ -1818,7 +1880,10 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-        inputs=[generation_section["text2music_audio_code_string"]],
@@ -1899,9 +1964,9 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-    # Save metadata handlers
-        fn=save_metadata,
@@ -1936,11 +2001,77 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-        outputs=[]
-        fn=save_metadata,
@@ -1975,7 +2106,73 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
-        outputs=[]
@@ -2017,4 +2214,152 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase

607
  info="Generate language in CoT (chain-of-thought)",
608
  scale=1,
609
  )
610
+ constrained_decoding_debug = gr.Checkbox(
611
+ label="Constrained Decoding Debug",
612
+ value=False,
613
+ info="Enable debug logging for constrained decoding (check to see detailed logs)",
614
+ scale=1,
615
+ )
616
 
617
  with gr.Row():
618
  audio_cover_strength = gr.Slider(
 
624
  info="Control how many denoising steps use LM-generated codes",
625
  scale=1,
626
  )
627
+ score_scale = gr.Slider(
628
+ minimum=1.0,
629
+ maximum=200.0,
630
+ value=10.0,
631
+ step=1.0,
632
+ label="Quality Score Sensitivity",
633
+ info="Lower = more sensitive to quality differences (default: 10.0)",
634
+ scale=1,
635
+ )
636
  output_alignment_preference = gr.Checkbox(
637
  label="Output Attention Focus Score (disabled)",
638
  value=False,
639
  info="Output attention focus score analysis",
640
  interactive=False,
641
+ visible=False,
642
  scale=1,
643
  )
644
 
 
648
  think_checkbox = gr.Checkbox(
649
  label="Think",
650
  value=True,
 
651
  scale=1,
652
  )
653
  generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg", interactive=generate_btn_interactive, scale=10)
654
+ instrumental_checkbox = gr.Checkbox(
655
+ label="Instrumental",
656
+ value=False,
657
+ scale=1,
658
+ )
659
 
660
  return {
661
  "service_config_accordion": service_config_accordion,
 
715
  "output_alignment_preference": output_alignment_preference,
716
  "think_checkbox": think_checkbox,
717
  "generate_btn": generate_btn,
718
+ "instrumental_checkbox": instrumental_checkbox,
719
+ "constrained_decoding_debug": constrained_decoding_debug,
720
+ "score_scale": score_scale,
721
  }
722
 
723
 
 
743
  )
744
  with gr.Row(equal_height=True):
745
  send_to_src_btn_1 = gr.Button(
746
+ "🔗 Send To Src Audio",
747
  variant="secondary",
748
  size="sm",
749
  scale=1
 
754
  size="sm",
755
  scale=1
756
  )
757
+ score_btn_1 = gr.Button(
758
+ "📊 Score",
759
+ variant="secondary",
760
+ size="sm",
761
+ scale=1
762
+ )
763
+ score_display_1 = gr.Textbox(
764
+ label="Quality Score (Sample 1)",
765
+ interactive=False,
766
+ placeholder="Click 'Score' to calculate perplexity-based quality score"
767
+ )
768
  with gr.Column():
769
  generated_audio_2 = gr.Audio(
770
  label="🎵 Generated Music (Sample 2)",
 
773
  )
774
  with gr.Row(equal_height=True):
775
  send_to_src_btn_2 = gr.Button(
776
+ "🔗 Send To Src Audio",
777
  variant="secondary",
778
  size="sm",
779
  scale=1
 
784
  size="sm",
785
  scale=1
786
  )
787
+ score_btn_2 = gr.Button(
788
+ "📊 Score",
789
+ variant="secondary",
790
+ size="sm",
791
+ scale=1
792
+ )
793
+ score_display_2 = gr.Textbox(
794
+ label="Quality Score (Sample 2)",
795
+ interactive=False,
796
+ placeholder="Click 'Score' to calculate perplexity-based quality score"
797
+ )
798
 
799
  with gr.Accordion("📁 Batch Results & Generation Details", open=False):
800
  generated_audio_batch = gr.File(
 
825
  "send_to_src_btn_2": send_to_src_btn_2,
826
  "save_btn_1": save_btn_1,
827
  "save_btn_2": save_btn_2,
828
+ "score_btn_1": score_btn_1,
829
+ "score_btn_2": score_btn_2,
830
+ "score_display_1": score_display_1,
831
+ "score_display_2": score_display_2,
832
  "generated_audio_batch": generated_audio_batch,
833
  "generation_info": generation_info,
834
  "align_score_1": align_score_1,
 
1091
  gr.Warning(f"Error loading example: {str(e)}")
1092
  return "", "", True, None, None, "", "", ""
1093
 
1094
+ def sample_example_smart(task_type: str, constrained_decoding_debug: bool = False):
1095
  """Smart sample function that uses LM if initialized, otherwise falls back to examples
1096
 
1097
  Args:
1098
  task_type: The task type (e.g., "text2music")
1099
+ constrained_decoding_debug: Whether to enable debug logging for constrained decoding
1100
 
1101
  Returns:
1102
  Tuple of (caption, lyrics, think, bpm, duration, keyscale, language, timesignature) for updating UI components
 
1110
  audio_codes="NO USER INPUT",
1111
  use_constrained_decoding=True,
1112
  temperature=0.85,
1113
+ constrained_decoding_debug=constrained_decoding_debug,
1114
  )
1115
 
1116
  if metadata:
 
1145
  if timesignature_value in [None, "N/A"]:
1146
  timesignature_value = ''
1147
 
1148
+ gr.Info("🤖 Generated example using LM")
1149
  return caption_value, lyrics_value, think_value, bpm_value, duration_value, keyscale_value, language_value, timesignature_value
1150
  else:
1151
  gr.Warning("Failed to generate example using LM, falling back to examples directory")
 
1336
  use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
1337
  think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
1338
  use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
1339
+ constrained_decoding_debug,
1340
  progress=gr.Progress(track_tqdm=True)
1341
  ):
1342
  # If think is enabled (llm_dit mode) and use_cot_metas is True, generate audio codes using LM first
 
1394
  use_cot_caption=use_cot_caption,
1395
  use_cot_language=use_cot_language,
1396
  is_format_caption=is_format_caption,
1397
+ constrained_decoding_debug=constrained_decoding_debug,
1398
  )
1399
 
1400
  # Store LM-generated metadata and audio codes for display
 
1524
  generation_section["use_cot_metas"],
1525
  generation_section["use_cot_caption"],
1526
  generation_section["use_cot_language"],
1527
+ results_section["is_format_caption_state"],
1528
+ generation_section["constrained_decoding_debug"]
1529
  ],
1530
  outputs=[
1531
  results_section["generated_audio_1"],
 
1774
 
1775
  # Sample button - smart sample (uses LM if initialized, otherwise examples)
1776
  # Need to add is_format_caption return value to sample_example_smart
1777
+ def sample_example_smart_with_flag(task_type: str, constrained_decoding_debug: bool):
1778
  """Wrapper for sample_example_smart that adds is_format_caption flag"""
1779
+ result = sample_example_smart(task_type, constrained_decoding_debug)
1780
  # Add True at the end to set is_format_caption
1781
  return result + (True,)
1782
 
1783
  generation_section["sample_btn"].click(
1784
  fn=sample_example_smart_with_flag,
1785
+ inputs=[
1786
+ generation_section["task_type"],
1787
+ generation_section["constrained_decoding_debug"]
1788
+ ],
1789
  outputs=[
1790
  generation_section["captions"],
1791
  generation_section["lyrics"],
 
1800
  )
1801
 
1802
  # Transcribe audio codes to metadata (or generate example if empty)
1803
+ def transcribe_audio_codes(audio_code_string, constrained_decoding_debug):
1804
  """
1805
  Transcribe audio codes to metadata using LLM understanding.
1806
  If audio_code_string is empty, generate a sample example instead.
1807
 
1808
  Args:
1809
  audio_code_string: String containing audio codes (or empty for example generation)
1810
+ constrained_decoding_debug: Whether to enable debug logging for constrained decoding
1811
 
1812
  Returns:
1813
  Tuple of (status_message, caption, lyrics, bpm, duration, keyscale, language, timesignature)
 
1821
  audio_code_string = "NO USER INPUT"
1822
 
1823
  # Call LLM understanding
1824
+ metadata, status = llm_handler.understand_audio_from_codes(
1825
+ audio_codes=audio_code_string,
1826
+ use_constrained_decoding=True,
1827
+ constrained_decoding_debug=constrained_decoding_debug,
1828
+ )
1829
 
1830
  # Extract fields for UI update
1831
  caption = metadata.get('caption', '')
 
1880
 
1881
  generation_section["transcribe_btn"].click(
1882
  fn=transcribe_audio_codes,
1883
+ inputs=[
1884
+ generation_section["text2music_audio_code_string"],
1885
+ generation_section["constrained_decoding_debug"]
1886
+ ],
1887
  outputs=[
1888
  results_section["status_output"], # Show status
1889
  generation_section["captions"], # Update caption field
 
1964
  outputs=[generation_section["audio_uploads_accordion"]]
1965
  )
1966
 
1967
+ # Save metadata handlers - use JavaScript to trigger automatic download
1968
  results_section["save_btn_1"].click(
1969
+ fn=None,
1970
  inputs=[
1971
  generation_section["task_type"],
1972
  generation_section["captions"],
 
2001
  generation_section["complete_track_classes"],
2002
  results_section["lm_metadata_state"],
2003
  ],
2004
+ outputs=None,
2005
+ js="""
2006
+ (task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature, audio_duration,
2007
+ batch_size_input, inference_steps, guidance_scale, seed, random_seed_checkbox,
2008
+ use_adg, cfg_interval_start, cfg_interval_end, audio_format,
2009
+ lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
2010
+ use_cot_caption, use_cot_language, audio_cover_strength,
2011
+ think_checkbox, text2music_audio_code_string, repainting_start, repainting_end,
2012
+ track_name, complete_track_classes, lm_metadata) => {
2013
+ // Create metadata object
2014
+ const metadata = {
2015
+ saved_at: new Date().toISOString(),
2016
+ task_type: task_type,
2017
+ caption: captions || "",
2018
+ lyrics: lyrics || "",
2019
+ vocal_language: vocal_language,
2020
+ bpm: bpm,
2021
+ keyscale: key_scale || "",
2022
+ timesignature: time_signature || "",
2023
+ duration: audio_duration,
2024
+ batch_size: batch_size_input,
2025
+ inference_steps: inference_steps,
2026
+ guidance_scale: guidance_scale,
2027
+ seed: seed,
2028
+ random_seed: random_seed_checkbox,
2029
+ use_adg: use_adg,
2030
+ cfg_interval_start: cfg_interval_start,
2031
+ cfg_interval_end: cfg_interval_end,
2032
+ audio_format: audio_format,
2033
+ lm_temperature: lm_temperature,
2034
+ lm_cfg_scale: lm_cfg_scale,
2035
+ lm_top_k: lm_top_k,
2036
+ lm_top_p: lm_top_p,
2037
+ lm_negative_prompt: lm_negative_prompt,
2038
+ use_cot_caption: use_cot_caption,
2039
+ use_cot_language: use_cot_language,
2040
+ audio_cover_strength: audio_cover_strength,
2041
+ think: think_checkbox,
2042
+ audio_codes: text2music_audio_code_string || "",
2043
+ repainting_start: repainting_start,
2044
+ repainting_end: repainting_end,
2045
+ track_name: track_name,
2046
+ complete_track_classes: complete_track_classes || []
2047
+ };
2048
+
2049
+ if (lm_metadata) {
2050
+ metadata.lm_generated_metadata = lm_metadata;
2051
+ }
2052
+
2053
+ // Create JSON string
2054
+ const jsonStr = JSON.stringify(metadata, null, 2);
2055
+
2056
+ // Create blob and download
2057
+ const blob = new Blob([jsonStr], { type: 'application/json' });
2058
+ const url = URL.createObjectURL(blob);
2059
+ const a = document.createElement('a');
2060
+ a.href = url;
2061
+ const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace('T', '_').split('.')[0];
2062
+ a.download = `generation_params_${timestamp}.json`;
2063
+ document.body.appendChild(a);
2064
+ a.click();
2065
+ document.body.removeChild(a);
2066
+ URL.revokeObjectURL(url);
2067
+
2068
+ return Array(32).fill(null);
2069
+ }
2070
+ """
2071
  )
2072
 
2073
  results_section["save_btn_2"].click(
2074
+ fn=None,
2075
  inputs=[
2076
  generation_section["task_type"],
2077
  generation_section["captions"],
 
2106
  generation_section["complete_track_classes"],
2107
  results_section["lm_metadata_state"],
2108
  ],
2109
+ outputs=None,
2110
+ js="""
2111
+ (task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature, audio_duration,
2112
+ batch_size_input, inference_steps, guidance_scale, seed, random_seed_checkbox,
2113
+ use_adg, cfg_interval_start, cfg_interval_end, audio_format,
2114
+ lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
2115
+ use_cot_caption, use_cot_language, audio_cover_strength,
2116
+ think_checkbox, text2music_audio_code_string, repainting_start, repainting_end,
2117
+ track_name, complete_track_classes, lm_metadata) => {
2118
+ // Create metadata object
2119
+ const metadata = {
2120
+ saved_at: new Date().toISOString(),
2121
+ task_type: task_type,
2122
+ caption: captions || "",
2123
+ lyrics: lyrics || "",
2124
+ vocal_language: vocal_language,
2125
+ bpm: bpm,
2126
+ keyscale: key_scale || "",
2127
+ timesignature: time_signature || "",
2128
+ duration: audio_duration,
2129
+ batch_size: batch_size_input,
2130
+ inference_steps: inference_steps,
2131
+ guidance_scale: guidance_scale,
2132
+ seed: seed,
2133
+ random_seed: random_seed_checkbox,
2134
+ use_adg: use_adg,
2135
+ cfg_interval_start: cfg_interval_start,
2136
+ cfg_interval_end: cfg_interval_end,
2137
+ audio_format: audio_format,
2138
+ lm_temperature: lm_temperature,
2139
+ lm_cfg_scale: lm_cfg_scale,
2140
+ lm_top_k: lm_top_k,
2141
+ lm_top_p: lm_top_p,
2142
+ lm_negative_prompt: lm_negative_prompt,
2143
+ use_cot_caption: use_cot_caption,
2144
+ use_cot_language: use_cot_language,
2145
+ audio_cover_strength: audio_cover_strength,
2146
+ think: think_checkbox,
2147
+ audio_codes: text2music_audio_code_string || "",
2148
+ repainting_start: repainting_start,
2149
+ repainting_end: repainting_end,
2150
+ track_name: track_name,
2151
+ complete_track_classes: complete_track_classes || []
2152
+ };
2153
+
2154
+ if (lm_metadata) {
2155
+ metadata.lm_generated_metadata = lm_metadata;
2156
+ }
2157
+
2158
+ // Create JSON string
2159
+ const jsonStr = JSON.stringify(metadata, null, 2);
2160
+
2161
+ // Create blob and download
2162
+ const blob = new Blob([jsonStr], { type: 'application/json' });
2163
+ const url = URL.createObjectURL(blob);
2164
+ const a = document.createElement('a');
2165
+ a.href = url;
2166
+ const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace('T', '_').split('.')[0];
2167
+ a.download = `generation_params_${timestamp}.json`;
2168
+ document.body.appendChild(a);
2169
+ a.click();
2170
+ document.body.removeChild(a);
2171
+ URL.revokeObjectURL(url);
2172
+
2173
+ return Array(32).fill(null);
2174
+ }
2175
+ """
2176
  )
2177
 
2178
  # Load metadata handler - triggered when file is uploaded via UploadButton
 
2214
  results_section["is_format_caption_state"]
2215
  ]
2216
  )
2217
+
2218
+ # Instrumental checkbox handler - auto-fill [Instrumental] when checked
2219
+ def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
2220
+ """
2221
+ Handle instrumental checkbox changes.
2222
+ When checked: if no lyrics, fill with [Instrumental]
2223
+ When unchecked: if lyrics is [Instrumental], clear it
2224
+ """
2225
+ if instrumental_checked:
2226
+ # If checked and no lyrics, fill with [Instrumental]
2227
+ if not current_lyrics or not current_lyrics.strip():
2228
+ return "[Instrumental]"
2229
+ else:
2230
+ # Has lyrics, don't change
2231
+ return current_lyrics
2232
+ else:
2233
+ # If unchecked and lyrics is exactly [Instrumental], clear it
2234
+ if current_lyrics and current_lyrics.strip() == "[Instrumental]":
2235
+ return ""
2236
+ else:
2237
+ # Has other lyrics, don't change
2238
+ return current_lyrics
2239
+
2240
+ generation_section["instrumental_checkbox"].change(
2241
+ fn=handle_instrumental_checkbox,
2242
+ inputs=[generation_section["instrumental_checkbox"], generation_section["lyrics"]],
2243
+ outputs=[generation_section["lyrics"]]
2244
+ )
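The checkbox handler above is a pure function of its two inputs, so its intended behaviour can be sketched with standalone calls (hypothetical, outside the Gradio event wiring):

    assert handle_instrumental_checkbox(True, "") == "[Instrumental]"        # checked, no lyrics -> auto-fill
    assert handle_instrumental_checkbox(True, "verse 1") == "verse 1"        # checked, lyrics present -> unchanged
    assert handle_instrumental_checkbox(False, "[Instrumental]") == ""       # unchecked, auto-filled marker -> cleared
    assert handle_instrumental_checkbox(False, "verse 1") == "verse 1"       # unchecked, real lyrics -> unchanged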
2245
+
2246
+ # Score calculation handlers
2247
+ def calculate_score_handler(audio_codes_str, caption, lyrics, lm_metadata, bpm, key_scale, time_signature, audio_duration, vocal_language, score_scale):
2248
+ """
2249
+ Calculate perplexity-based quality score for generated audio.
2250
+
2251
+ Args:
2252
+ audio_codes_str: Generated audio codes string
2253
+ caption: Caption text used for generation
2254
+ lyrics: Lyrics text used for generation
2255
+ lm_metadata: LM-generated metadata dictionary (from CoT generation)
2256
+ bpm: BPM value
2257
+ key_scale: Key scale value
2258
+ time_signature: Time signature value
2259
+ audio_duration: Audio duration value
2260
+ vocal_language: Vocal language value
2261
+ score_scale: Sensitivity scale parameter (lower = more sensitive)
2262
+
2263
+ Returns:
2264
+ Score display string
2265
+ """
2266
+ from acestep.test_time_scaling import calculate_perplexity, perplexity_to_score
2267
+
2268
+ if not llm_handler.llm_initialized:
2269
+ return "❌ LLM not initialized. Please initialize 5Hz LM first."
2270
+
2271
+ if not audio_codes_str or not audio_codes_str.strip():
2272
+ return "❌ No audio codes available. Please generate music first."
2273
+
2274
+ try:
2275
+ # Build metadata dictionary from both LM metadata and user inputs
2276
+ metadata = {}
2277
+
2278
+ # Priority 1: Use LM-generated metadata if available
2279
+ if lm_metadata and isinstance(lm_metadata, dict):
2280
+ metadata.update(lm_metadata)
2281
+
2282
+ # Priority 2: Add user-provided metadata (if not already in LM metadata)
2283
+ if bpm is not None and 'bpm' not in metadata:
2284
+ try:
2285
+ metadata['bpm'] = int(bpm)
2286
+ except (TypeError, ValueError):
2287
+ pass
2288
+
2289
+ if caption and 'caption' not in metadata:
2290
+ metadata['caption'] = caption
2291
+
2292
+ if audio_duration is not None and audio_duration > 0 and 'duration' not in metadata:
2293
+ try:
2294
+ metadata['duration'] = int(audio_duration)
2295
+ except (TypeError, ValueError):
2296
+ pass
2297
+
2298
+ if key_scale and key_scale.strip() and 'keyscale' not in metadata:
2299
+ metadata['keyscale'] = key_scale.strip()
2300
+
2301
+ if vocal_language and vocal_language.strip() and 'language' not in metadata:
2302
+ metadata['language'] = vocal_language.strip()
2303
+
2304
+ if time_signature and time_signature.strip() and 'timesignature' not in metadata:
2305
+ metadata['timesignature'] = time_signature.strip()
2306
+
2307
+ # Calculate perplexity
2308
+ perplexity, status = calculate_perplexity(
2309
+ llm_handler=llm_handler,
2310
+ audio_codes=audio_codes_str,
2311
+ caption=caption or "",
2312
+ lyrics=lyrics or "",
2313
+ metadata=metadata if metadata else None,
2314
+ temperature=1.0
2315
+ )
2316
+
2317
+ # Convert perplexity to normalized score [0, 1] (higher is better)
2318
+ normalized_score = perplexity_to_score(perplexity, scale=score_scale)
2319
+
2320
+ # Format display string
2321
+ if perplexity == float('inf'):
2322
+ return f"❌ Scoring failed: {status}"
2323
+ else:
2324
+ return f"✅ Quality Score: {normalized_score:.4f} (range: 0-1, higher is better)\nPerplexity: {perplexity:.4f}\nSensitivity: {score_scale}\n{status}"
2325
+
2326
+ except Exception as e:
2327
+ import traceback
2328
+ error_msg = f"❌ Error calculating score: {str(e)}\n{traceback.format_exc()}"
2329
+ return error_msg
2330
+
2331
+ # Connect score buttons to handlers
2332
+ results_section["score_btn_1"].click(
2333
+ fn=calculate_score_handler,
2334
+ inputs=[
2335
+ generation_section["text2music_audio_code_string"],
2336
+ generation_section["captions"],
2337
+ generation_section["lyrics"],
2338
+ results_section["lm_metadata_state"],
2339
+ generation_section["bpm"],
2340
+ generation_section["key_scale"],
2341
+ generation_section["time_signature"],
2342
+ generation_section["audio_duration"],
2343
+ generation_section["vocal_language"],
2344
+ generation_section["score_scale"]
2345
+ ],
2346
+ outputs=[results_section["score_display_1"]]
2347
+ )
2348
+
2349
+ results_section["score_btn_2"].click(
2350
+ fn=calculate_score_handler,
2351
+ inputs=[
2352
+ generation_section["text2music_audio_code_string"],
2353
+ generation_section["captions"],
2354
+ generation_section["lyrics"],
2355
+ results_section["lm_metadata_state"],
2356
+ generation_section["bpm"],
2357
+ generation_section["key_scale"],
2358
+ generation_section["time_signature"],
2359
+ generation_section["audio_duration"],
2360
+ generation_section["vocal_language"],
2361
+ generation_section["score_scale"]
2362
+ ],
2363
+ outputs=[results_section["score_display_2"]]
2364
+ )
2365
 
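calculate_score_handler above gives LM-generated metadata priority and only falls back to user-supplied values for fields the LM did not emit. A minimal sketch of that merge rule in isolation (hypothetical helper, not part of this commit):

    def merge_metadata(lm_metadata, user_fields):
        # LM-generated values win; user values only fill fields the LM left unset
        merged = dict(lm_metadata or {})
        for key, value in (user_fields or {}).items():
            if value not in (None, "") and key not in merged:
                merged[key] = value
        return merged

    merge_metadata({"bpm": 120}, {"bpm": 90, "keyscale": "C major"})
    # -> {"bpm": 120, "keyscale": "C major"}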
acestep/llm_inference.py CHANGED
@@ -39,6 +39,9 @@ class LLMHandler:
39
 
40
  # Shared constrained decoding processor (initialized once when LLM is loaded)
41
  self.constrained_processor: Optional[MetadataConstrainedLogitsProcessor] = None
 
 
 
42
 
43
  def get_available_5hz_lm_models(self) -> List[str]:
44
  """Scan and return all model directory names starting with 'acestep-5Hz-lm-'"""
@@ -246,6 +249,7 @@ class LLMHandler:
246
  target_duration: Optional[float] = None,
247
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
248
  stop_at_reasoning: bool = False,
 
249
  skip_caption: bool = False,
250
  skip_language: bool = False,
251
  generation_phase: str = "cot",
@@ -276,6 +280,7 @@ class LLMHandler:
276
  self.constrained_processor.set_user_metadata(user_metadata)
277
  self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
278
  # Set skip_caption and skip_language based on flags
 
279
  self.constrained_processor.set_skip_caption(skip_caption)
280
  self.constrained_processor.set_skip_language(skip_language)
281
  # Set generation phase for phase-aware processing
@@ -347,6 +352,7 @@ class LLMHandler:
347
  target_duration: Optional[float] = None,
348
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
349
  stop_at_reasoning: bool = False,
 
350
  skip_caption: bool = False,
351
  skip_language: bool = False,
352
  generation_phase: str = "cot",
@@ -376,6 +382,7 @@ class LLMHandler:
376
  self.constrained_processor.set_user_metadata(user_metadata)
377
  self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
378
  # Set skip_caption and skip_language based on flags
 
379
  self.constrained_processor.set_skip_caption(skip_caption)
380
  self.constrained_processor.set_skip_language(skip_language)
381
  # Set generation phase for phase-aware processing
@@ -597,6 +604,7 @@ class LLMHandler:
597
  "user_metadata": user_metadata,
598
  "skip_caption": not use_cot_caption,
599
  "skip_language": not use_cot_language,
 
600
  "generation_phase": "cot",
601
  # Pass context for building unconditional prompt in CoT phase
602
  "caption": caption,
@@ -863,7 +871,6 @@ class LLMHandler:
863
  - bpm: int or str
864
  - caption: str
865
  - duration: int or str
866
- - genres: str
867
  - keyscale: str
868
  - language: str
869
  - timesignature: str
@@ -901,6 +908,7 @@ class LLMHandler:
901
  "user_metadata": None, # No user metadata injection
902
  "skip_caption": False, # Generate caption
903
  "skip_language": False, # Generate language
 
904
  "generation_phase": "understand", # Understanding phase: generate CoT metadata, then free-form lyrics
905
  # Context for building unconditional prompt
906
  "caption": "",
@@ -1015,6 +1023,7 @@ class LLMHandler:
1015
  user_metadata = cfg.get("user_metadata") # User-provided metadata fields
1016
  skip_caption = cfg.get("skip_caption", False) # Skip caption generation in CoT
1017
  skip_language = cfg.get("skip_language", False) # Skip language generation in CoT
 
1018
  generation_phase = cfg.get("generation_phase", "cot") # "cot" or "codes"
1019
  # Additional context for codes phase unconditional prompt building
1020
  caption = cfg.get("caption", "")
@@ -1036,6 +1045,7 @@ class LLMHandler:
1036
  target_duration=target_duration,
1037
  user_metadata=user_metadata,
1038
  stop_at_reasoning=stop_at_reasoning,
 
1039
  skip_caption=skip_caption,
1040
  skip_language=skip_language,
1041
  generation_phase=generation_phase,
@@ -1059,6 +1069,7 @@ class LLMHandler:
1059
  target_duration=target_duration,
1060
  user_metadata=user_metadata,
1061
  stop_at_reasoning=stop_at_reasoning,
 
1062
  skip_caption=skip_caption,
1063
  skip_language=skip_language,
1064
  generation_phase=generation_phase,
@@ -1521,3 +1532,51 @@ class LLMHandler:
1521
  torch.cuda.empty_cache()
1522
  offload_time = time.time() - start_time
1523
  logger.info(f"Offloaded LLM to CPU in {offload_time:.4f}s")
39
 
40
  # Shared constrained decoding processor (initialized once when LLM is loaded)
41
  self.constrained_processor: Optional[MetadataConstrainedLogitsProcessor] = None
42
+
43
+ # Shared HuggingFace model for perplexity calculation (when using vllm backend)
44
+ self._hf_model_for_scoring = None
45
 
46
  def get_available_5hz_lm_models(self) -> List[str]:
47
  """Scan and return all model directory names starting with 'acestep-5Hz-lm-'"""
 
249
  target_duration: Optional[float] = None,
250
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
251
  stop_at_reasoning: bool = False,
252
+ skip_genres: bool = True,
253
  skip_caption: bool = False,
254
  skip_language: bool = False,
255
  generation_phase: str = "cot",
 
280
  self.constrained_processor.set_user_metadata(user_metadata)
281
  self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
282
  # Set skip_caption and skip_language based on flags
283
+ self.constrained_processor.set_skip_genres(skip_genres)
284
  self.constrained_processor.set_skip_caption(skip_caption)
285
  self.constrained_processor.set_skip_language(skip_language)
286
  # Set generation phase for phase-aware processing
 
352
  target_duration: Optional[float] = None,
353
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
354
  stop_at_reasoning: bool = False,
355
+ skip_genres: bool = True,
356
  skip_caption: bool = False,
357
  skip_language: bool = False,
358
  generation_phase: str = "cot",
 
382
  self.constrained_processor.set_user_metadata(user_metadata)
383
  self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
384
  # Set skip_caption and skip_language based on flags
385
+ self.constrained_processor.set_skip_genres(skip_genres)
386
  self.constrained_processor.set_skip_caption(skip_caption)
387
  self.constrained_processor.set_skip_language(skip_language)
388
  # Set generation phase for phase-aware processing
 
604
  "user_metadata": user_metadata,
605
  "skip_caption": not use_cot_caption,
606
  "skip_language": not use_cot_language,
607
+ "skip_genres": True, # Do not generate genres in the CoT phase
608
  "generation_phase": "cot",
609
  # Pass context for building unconditional prompt in CoT phase
610
  "caption": caption,
 
871
  - bpm: int or str
872
  - caption: str
873
  - duration: int or str
 
874
  - keyscale: str
875
  - language: str
876
  - timesignature: str
 
908
  "user_metadata": None, # No user metadata injection
909
  "skip_caption": False, # Generate caption
910
  "skip_language": False, # Generate language
911
+ "skip_genres": False, # Generate genres
912
  "generation_phase": "understand", # Understanding phase: generate CoT metadata, then free-form lyrics
913
  # Context for building unconditional prompt
914
  "caption": "",
 
1023
  user_metadata = cfg.get("user_metadata") # User-provided metadata fields
1024
  skip_caption = cfg.get("skip_caption", False) # Skip caption generation in CoT
1025
  skip_language = cfg.get("skip_language", False) # Skip language generation in CoT
1026
+ skip_genres = cfg.get("skip_genres", False) # Skip genres generation in CoT
1027
  generation_phase = cfg.get("generation_phase", "cot") # "cot" or "codes"
1028
  # Additional context for codes phase unconditional prompt building
1029
  caption = cfg.get("caption", "")
 
1045
  target_duration=target_duration,
1046
  user_metadata=user_metadata,
1047
  stop_at_reasoning=stop_at_reasoning,
1048
+ skip_genres=skip_genres,
1049
  skip_caption=skip_caption,
1050
  skip_language=skip_language,
1051
  generation_phase=generation_phase,
 
1069
  target_duration=target_duration,
1070
  user_metadata=user_metadata,
1071
  stop_at_reasoning=stop_at_reasoning,
1072
+ skip_genres=skip_genres,
1073
  skip_caption=skip_caption,
1074
  skip_language=skip_language,
1075
  generation_phase=generation_phase,
 
1532
  torch.cuda.empty_cache()
1533
  offload_time = time.time() - start_time
1534
  logger.info(f"Offloaded LLM to CPU in {offload_time:.4f}s")
1535
+
1536
+ def get_hf_model_for_scoring(self):
1537
+ """
1538
+ Get HuggingFace model for perplexity scoring.
1539
+
1540
+ For vllm backend, loads HuggingFace model from disk (weights are cached by transformers).
1541
+ For pt backend, returns the existing model.
1542
+
1543
+ Returns:
1544
+ HuggingFace model instance
1545
+ """
1546
+ if self.llm_backend == "pt":
1547
+ # For PyTorch backend, directly return the model
1548
+ return self.llm
1549
+
1550
+ elif self.llm_backend == "vllm":
1551
+ # For vllm backend, load HuggingFace model from disk
1552
+ # Note: transformers caches model weights, so this doesn't duplicate disk I/O
1553
+ if self._hf_model_for_scoring is None:
1554
+ logger.info("Loading HuggingFace model for scoring (from checkpoint)")
1555
+
1556
+ # Get model path from vllm config
1557
+ model_runner = self.llm.model_runner
1558
+ model_path = model_runner.config.model
1559
+
1560
+ # Load HuggingFace model from the same checkpoint
1561
+ # This will load the original unfused weights
1562
+ import time
1563
+ start_time = time.time()
1564
+ self._hf_model_for_scoring = AutoModelForCausalLM.from_pretrained(
1565
+ model_path,
1566
+ trust_remote_code=True,
1567
+ torch_dtype=self.dtype
1568
+ )
1569
+ load_time = time.time() - start_time
1570
+ logger.info(f"HuggingFace model loaded in {load_time:.2f}s")
1571
+
1572
+ # Move to same device as vllm model
1573
+ device = next(model_runner.model.parameters()).device
1574
+ self._hf_model_for_scoring = self._hf_model_for_scoring.to(device)
1575
+ self._hf_model_for_scoring.eval()
1576
+
1577
+ logger.info(f"HuggingFace model for scoring ready on {device}")
1578
+
1579
+ return self._hf_model_for_scoring
1580
+
1581
+ else:
1582
+ raise ValueError(f"Unknown backend: {self.llm_backend}")
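Assuming an initialized handler, the scoring model is obtained lazily and then used only for forward passes; a rough usage sketch (the prompt string is illustrative):

    import torch

    model = llm_handler.get_hf_model_for_scoring()   # pt: existing model; vllm: HF copy loaded on first call
    tokenizer = llm_handler.llm_tokenizer
    device = next(model.parameters()).device
    inputs = tokenizer("<|audio_code_123|>", return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits               # [1, seq_len, vocab_size]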
acestep/test_time_scaling.py ADDED
@@ -0,0 +1,261 @@
1
+ """
2
+ Test-Time Scaling Module
3
+ Implements perplexity-based scoring for generated audio codes
4
+ """
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from typing import Tuple, Optional, Dict, Any
8
+ from loguru import logger
9
+ import yaml
10
+
11
+
12
+ def perplexity_to_score(perplexity: float, scale: float = 100.0) -> float:
13
+ """
14
+ Convert perplexity to a normalized score in [0, 1] range.
15
+
16
+ Lower perplexity = higher score (better quality)
17
+ Uses exponential decay: score = exp(-perplexity / scale)
18
+
19
+ Args:
20
+ perplexity: Perplexity value (typically 1 to 1000+)
21
+ scale: Scale parameter to control score distribution (default 100.0)
22
+ - Smaller scale: more sensitive to perplexity changes
23
+ - Larger scale: less sensitive to perplexity changes
24
+
25
+ Returns:
26
+ Score in [0, 1] range, where 1 is perfect and 0 is worst
27
+
28
+ Examples:
29
+ perplexity=1 → score≈0.99 (excellent)
30
+ perplexity=50 → score≈0.61 (good if scale=100)
31
+ perplexity=100 → score≈0.37 (medium if scale=100)
32
+ perplexity=500 → score≈0.01 (poor if scale=100)
33
+ """
34
+ import math
35
+ return math.exp(-perplexity / scale)
36
+
37
+
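A quick check of the mapping documented above (matches the docstring examples at the default scale of 100):

    import math

    for ppl in (1, 50, 100, 500):
        print(ppl, round(math.exp(-ppl / 100.0), 2))   # 0.99, 0.61, 0.37, 0.01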
38
+ def calculate_perplexity(
39
+ llm_handler,
40
+ audio_codes: str,
41
+ caption: str = "",
42
+ lyrics: str = "",
43
+ metadata: Optional[Dict[str, Any]] = None,
44
+ temperature: float = 1.0,
45
+ ) -> Tuple[float, str]:
46
+ """
47
+ Calculate perplexity of generated audio codes conditioned on caption/lyrics/metadata.
48
+
49
+ This reverses the generation task: given audio codes as input, measure how well
50
+ the model can predict the CoT metadata and lyrics that should generate those codes.
51
+
52
+ Lower perplexity = model is less surprised = better quality generation
53
+ Use perplexity_to_score() to map perplexity to a normalized [0, 1] score (higher is better)
54
+
55
+ The understanding task format is:
56
+ Input: <|audio_code_123|><|audio_code_456|>...
57
+ Output: <think>\nmetadata_yaml\n</think>\n\n# Lyric\nlyrics_text
58
+
59
+ Args:
60
+ llm_handler: LLM handler instance with initialized model
61
+ audio_codes: Generated audio code string (e.g., "<|audio_code_123|><|audio_code_456|>...")
62
+ caption: Caption text used for generation
63
+ lyrics: Lyrics text used for generation
64
+ metadata: Dictionary with CoT metadata fields (bpm, duration, keyscale, language, timesignature, etc.)
65
+ temperature: Temperature for probability scaling (default 1.0)
66
+
67
+ Returns:
68
+ Tuple of (perplexity_value, status_message)
69
+
70
+ Example:
71
+ metadata = {'bpm': 120, 'duration': 30, 'keyscale': 'C major', 'language': 'en', 'timesignature': '4'}
72
+ perplexity, status = calculate_perplexity(
73
+ llm_handler,
74
+ audio_codes="<|audio_code_123|>...",
75
+ caption="calm piano",
76
+ lyrics="verse 1...",
77
+ metadata=metadata
78
+ )
79
+ score = perplexity_to_score(perplexity) # Higher score = better quality
80
+ """
81
+ if not llm_handler.llm_initialized:
82
+ return float('inf'), "❌ LLM not initialized"
83
+
84
+ if not audio_codes or not audio_codes.strip():
85
+ return float('inf'), "❌ No audio codes provided"
86
+
87
+ try:
88
+ # Build the understanding prompt: codes as input
89
+ # The model should generate: <think>metadata</think>\n# Lyric\n...
90
+ formatted_prompt = llm_handler.build_formatted_prompt_for_understanding(
91
+ audio_codes=audio_codes,
92
+ is_negative_prompt=False
93
+ )
94
+
95
+ logger.info(f"Calculating perplexity for a {len(audio_codes)}-character audio code string")
96
+
97
+ # Build the expected output (target sequence) following understanding task format
98
+ # Format: <think>\nmetadata_yaml\n</think>\n\n# Lyric\nlyrics_text
99
+ target_parts = []
100
+
101
+ # Build CoT section with metadata
102
+ if metadata and isinstance(metadata, dict):
103
+ # Filter out None values and format as YAML (sorted keys)
104
+ cot_items = {}
105
+ for key in ['bpm', 'caption', 'duration', 'genres', 'keyscale', 'language', 'timesignature']:
106
+ if key in metadata and metadata[key] is not None:
107
+ cot_items[key] = metadata[key]
108
+
109
+ if cot_items:
110
+ cot_yaml = yaml.dump(cot_items, allow_unicode=True, sort_keys=True).strip()
111
+ target_parts.append(f"<think>\n{cot_yaml}\n</think>\n")
112
+
113
+ # Add Lyric section (note: understanding task uses "# Lyric" not "# Caption")
114
+ if lyrics:
115
+ target_parts.append(f"\n# Lyric\n{lyrics}\n")
116
+
117
+ target_text = "".join(target_parts)
118
+
119
+ if not target_text.strip():
120
+ return float('inf'), "❌ No target text to evaluate (lyrics or metadata required)"
121
+
122
+ logger.debug(f"Target text (first 200 chars): {target_text[:200]}...")
123
+
124
+ # Calculate perplexity using appropriate backend
125
+ if llm_handler.llm_backend == "vllm":
126
+ perplexity = _calculate_perplexity_vllm(
127
+ llm_handler,
128
+ formatted_prompt,
129
+ target_text,
130
+ temperature
131
+ )
132
+ else: # pt backend
133
+ perplexity = _calculate_perplexity_pt(
134
+ llm_handler,
135
+ formatted_prompt,
136
+ target_text,
137
+ temperature
138
+ )
139
+
140
+ status_msg = f"✅ Perplexity calculated: {perplexity:.4f}"
141
+ logger.info(status_msg)
142
+ return perplexity, status_msg
143
+
144
+ except Exception as e:
145
+ error_msg = f"❌ Error calculating perplexity: {str(e)}"
146
+ logger.error(error_msg)
147
+ import traceback
148
+ logger.error(traceback.format_exc())
149
+ return float('inf'), error_msg
150
+
151
+
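Concretely, with target tokens y_1, ..., y_N (the CoT metadata plus lyrics) conditioned on the audio-code prompt x, the quantity computed by the backend helpers below is the standard token-level perplexity:

    \mathrm{PPL} = \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N}\log p_\theta\left(y_i \mid x, y_{<i}\right)\right)

perplexity_to_score() then maps this value to exp(-PPL / scale).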
152
+ def _calculate_perplexity_pt(
153
+ llm_handler,
154
+ formatted_prompt: str,
155
+ target_text: str,
156
+ temperature: float
157
+ ) -> float:
158
+ """
159
+ Calculate perplexity using PyTorch backend.
160
+
161
+ For the vllm backend, this uses a separate HuggingFace model loaded from the same checkpoint.
162
+ For pt backend, this uses the original model.
163
+
164
+ Args:
165
+ llm_handler: LLM handler with pt or vllm backend
166
+ formatted_prompt: Formatted input prompt (audio codes)
167
+ target_text: Expected output text (CoT metadata + lyrics)
168
+ temperature: Temperature for probability scaling
169
+
170
+ Returns:
171
+ Perplexity value
172
+ """
173
+ # Get model for scoring (handles both pt and vllm backends)
174
+ model = llm_handler.get_hf_model_for_scoring()
175
+ tokenizer = llm_handler.llm_tokenizer
176
+ device = llm_handler.device if llm_handler.llm_backend == "pt" else next(model.parameters()).device
177
+
178
+ # Tokenize prompt and target separately
179
+ prompt_tokens = tokenizer(
180
+ formatted_prompt,
181
+ return_tensors="pt",
182
+ padding=False,
183
+ truncation=True,
184
+ )
185
+
186
+ target_tokens = tokenizer(
187
+ target_text,
188
+ return_tensors="pt",
189
+ padding=False,
190
+ truncation=True,
191
+ )
192
+
193
+ # Concatenate prompt + target for full sequence
194
+ full_input_ids = torch.cat([
195
+ prompt_tokens['input_ids'],
196
+ target_tokens['input_ids']
197
+ ], dim=1).to(device)
198
+
199
+ # Create attention mask
200
+ attention_mask = torch.ones_like(full_input_ids)
201
+
202
+ # Forward pass to get logits
203
+ with torch.no_grad():
204
+ with llm_handler._load_model_context():
205
+ outputs = model(
206
+ input_ids=full_input_ids,
207
+ attention_mask=attention_mask
208
+ )
209
+ logits = outputs.logits # [batch_size, seq_len, vocab_size]
210
+
211
+ # Get the logits for predicting target tokens
212
+ # Shift logits and labels: logits[i] predicts token[i+1]
213
+ prompt_len = prompt_tokens['input_ids'].shape[1]
214
+ target_len = target_tokens['input_ids'].shape[1]
215
+
216
+ # Extract logits for positions that predict target tokens
217
+ # logits at positions [prompt_len-1 : prompt_len+target_len-1] predict target tokens
218
+ pred_logits = logits[0, prompt_len-1:prompt_len+target_len-1, :] # [target_len, vocab_size]
219
+ target_ids = target_tokens['input_ids'][0].to(device) # [target_len], moved to the logits device
220
+
221
+ # Apply temperature scaling
222
+ if temperature != 1.0:
223
+ pred_logits = pred_logits / temperature
224
+
225
+ # Calculate cross-entropy loss for each position
226
+ log_probs = F.log_softmax(pred_logits, dim=-1) # [target_len, vocab_size]
227
+
228
+ # Gather log probabilities of target tokens
229
+ target_log_probs = log_probs[torch.arange(target_len, device=device), target_ids] # [target_len]
230
+
231
+ # Calculate perplexity: exp(-mean(log_probs))
232
+ mean_neg_log_prob = -target_log_probs.mean()
233
+ perplexity = torch.exp(mean_neg_log_prob).item()
234
+
235
+ return perplexity
236
+
237
+
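The slice logits[0, prompt_len-1 : prompt_len+target_len-1, :] above relies on the usual next-token shift; a tiny worked example with made-up lengths:

    # prompt_len = 3, target_len = 2, full sequence = [p0, p1, p2, t0, t1]
    # logits at position 2 (= prompt_len - 1) predict t0,
    # logits at position 3 predict t1,
    # so positions [prompt_len - 1, prompt_len + target_len - 1) cover exactly the target tokens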
238
+ def _calculate_perplexity_vllm(
239
+ llm_handler,
240
+ formatted_prompt: str,
241
+ target_text: str,
242
+ temperature: float
243
+ ) -> float:
244
+ """
245
+ Calculate perplexity using vllm backend.
246
+
247
+ Uses shared-weight HuggingFace model for perplexity calculation.
248
+ This avoids the complexity of nanovllm's context management.
249
+
250
+ Args:
251
+ llm_handler: LLM handler with vllm backend
252
+ formatted_prompt: Formatted input prompt (audio codes)
253
+ target_text: Expected output text (CoT metadata + lyrics)
254
+ temperature: Temperature for probability scaling
255
+
256
+ Returns:
257
+ Perplexity value
258
+ """
259
+ logger.debug("Using vllm backend with a HuggingFace model loaded from the same checkpoint for perplexity")
260
+ # Delegate to pt backend implementation which now handles both backends
261
+ return _calculate_perplexity_pt(llm_handler, formatted_prompt, target_text, temperature)