ChuxiJ committed on
Commit
ba7469b
·
1 Parent(s): 51dc2aa

clean codes & refact lm gen & add examples

Browse files
Files changed (43) hide show
  1. .gitignore +2 -1
  2. acestep/acestep_v15_pipeline.py +1 -1
  3. acestep/constrained_logits_processor.py +143 -15
  4. acestep/gradio_ui.py +304 -170
  5. acestep/llm_inference.py +23 -431
  6. examples/text2music/example_01.json +5 -0
  7. examples/text2music/example_02.json +5 -0
  8. examples/text2music/example_03.json +5 -0
  9. examples/text2music/example_04.json +5 -0
  10. examples/text2music/example_05.json +5 -0
  11. examples/text2music/example_06.json +5 -0
  12. examples/text2music/example_07.json +5 -0
  13. examples/text2music/example_08.json +5 -0
  14. examples/text2music/example_09.json +5 -0
  15. examples/text2music/example_10.json +5 -0
  16. examples/text2music/example_11.json +5 -0
  17. examples/text2music/example_12.json +5 -0
  18. examples/text2music/example_13.json +5 -0
  19. examples/text2music/example_14.json +5 -0
  20. examples/text2music/example_15.json +5 -0
  21. examples/text2music/example_16.json +5 -0
  22. examples/text2music/example_17.json +5 -0
  23. examples/text2music/example_18.json +5 -0
  24. examples/text2music/example_19.json +5 -0
  25. examples/text2music/example_20.json +5 -0
  26. examples/text2music/example_21.json +5 -0
  27. examples/text2music/example_22.json +5 -0
  28. examples/text2music/example_23.json +5 -0
  29. examples/text2music/example_24.json +5 -0
  30. examples/text2music/example_25.json +5 -0
  31. examples/text2music/example_26.json +5 -0
  32. examples/text2music/example_27.json +5 -0
  33. examples/text2music/example_28.json +5 -0
  34. examples/text2music/example_29.json +5 -0
  35. examples/text2music/example_30.json +5 -0
  36. examples/text2music/example_31.json +5 -0
  37. examples/text2music/example_32.json +5 -0
  38. examples/text2music/example_33.json +5 -0
  39. examples/text2music/example_34.json +5 -0
  40. examples/text2music/example_35.json +5 -0
  41. examples/text2music/example_36.json +5 -0
  42. examples/text2music/example_37.json +5 -0
  43. test.py +0 -212
.gitignore CHANGED
@@ -219,4 +219,5 @@ README_old.md
219
  discord_bot/
220
  feishu_bot/
221
  tmp*
222
- torchinductor_root/
 
 
219
  discord_bot/
220
  feishu_bot/
221
  tmp*
222
+ torchinductor_root/
223
+ scripts/
acestep/acestep_v15_pipeline.py CHANGED
@@ -70,7 +70,7 @@ def main():
70
  # Service initialization arguments
71
  parser.add_argument("--init_service", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Initialize service on startup (default: False)")
72
  parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint file path (optional, for display purposes)")
73
- parser.add_argument("--config_path", type=str, default=None, help="Main model path (e.g., 'acestep-v15-turbo')")
74
  parser.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"], help="Processing device (default: auto)")
75
  parser.add_argument("--init_llm", type=lambda x: x.lower() in ['true', '1', 'yes'], default=True, help="Initialize 5Hz LM (default: True)")
76
  parser.add_argument("--lm_model_path", type=str, default=None, help="5Hz LM model path (e.g., 'acestep-5Hz-lm-0.6B')")
 
70
  # Service initialization arguments
71
  parser.add_argument("--init_service", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Initialize service on startup (default: False)")
72
  parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint file path (optional, for display purposes)")
73
+ parser.add_argument("--config_path", type=str, default=None, help="Main model path (e.g., 'acestep-v15-turbo-rl')")
74
  parser.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"], help="Processing device (default: auto)")
75
  parser.add_argument("--init_llm", type=lambda x: x.lower() in ['true', '1', 'yes'], default=True, help="Initialize 5Hz LM (default: True)")
76
  parser.add_argument("--lm_model_path", type=str, default=None, help="5Hz LM model path (e.g., 'acestep-5Hz-lm-0.6B')")
acestep/constrained_logits_processor.py CHANGED
@@ -105,6 +105,9 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
105
  self.target_codes: Optional[int] = None # Computed target codes count
106
  self.codes_count: int = 0 # Counter for generated codes
107
 
 
 
 
108
  # Current state
109
  self.state = FSMState.THINK_TAG
110
  self.position_in_state = 0 # Position within current state's fixed string
@@ -266,6 +269,16 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
266
  self.skip_genres = skip
267
  self._build_state_transitions()
268
 
 
 
 
 
 
 
 
 
 
 
269
  def set_user_metadata(self, metadata: Optional[Dict[str, Optional[str]]] = None):
270
  """
271
  Set user-provided metadata fields. Fields that are provided will be used directly
@@ -957,29 +970,73 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
957
  """
958
  Get the token IDs that can continue the fixed string from current position.
959
  Returns list of allowed token IDs.
 
 
 
960
  """
961
  remaining = fixed_str[self.position_in_state:]
962
  if not remaining:
963
  return []
964
 
965
- # Try to find tokens that match the beginning of remaining string
966
- allowed = []
 
 
 
 
 
967
 
968
- # Try encoding progressively longer prefixes
969
- for end in range(1, len(remaining) + 1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970
  prefix = remaining[:end]
971
  tokens = self.tokenizer.encode(prefix, add_special_tokens=False)
972
  if tokens:
973
- # The first token that matches is valid
974
- allowed.append(tokens[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
975
 
976
- # Also check single character encoding
977
- first_char = remaining[0]
978
- char_tokens = self.tokenizer.encode(first_char, add_special_tokens=False)
979
- if char_tokens:
980
- allowed.extend(char_tokens)
981
 
982
- return list(set(allowed))
983
 
984
  def _get_allowed_digit_tokens(self, min_val: int, max_val: int) -> List[int]:
985
  """
@@ -1271,8 +1328,28 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1271
 
1272
  if self.state in self.fixed_strings:
1273
  # Fixed string state: force specific tokens
1274
- allowed = self._get_allowed_tokens_for_fixed_string(self.fixed_strings[self.state])
 
 
1275
  if allowed:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1276
  for t in allowed:
1277
  mask[0, t] = 0
1278
  # Apply mask
@@ -1283,6 +1360,17 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1283
  # This will be done in update_state() after token selection
1284
  else:
1285
  # Position exceeds string, move to next state
 
 
 
 
 
 
 
 
 
 
 
1286
  old_state = self.state
1287
  self._transition_to_next_state()
1288
  # Avoid infinite recursion: if we're still in a fixed_strings state, just return scores
@@ -1351,7 +1439,6 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1351
  # All digits generated, force newline
1352
  if self.newline_token:
1353
  mask[0, self.newline_token] = 0
1354
- self._transition_to_next_state()
1355
 
1356
  scores = scores + mask
1357
  else:
@@ -1487,7 +1574,16 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1487
  """Transition to the next FSM state."""
1488
  if self.state in self.next_state:
1489
  old_state = self.state
1490
- self.state = self.next_state[self.state]
 
 
 
 
 
 
 
 
 
1491
  self.position_in_state = 0
1492
  self.accumulated_value = "" # Legacy, kept for compatibility
1493
  self.accumulated_token_ids = [] # Reset token ID sequence for new field
@@ -1566,8 +1662,26 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1566
  elif self.state in [FSMState.BPM_VALUE, FSMState.DURATION_VALUE, FSMState.TIMESIG_VALUE]:
1567
  # Accumulate numeric value using token ID sequence
1568
  if generated_token_id == self.newline_token:
 
 
 
 
 
 
 
 
 
 
1569
  # Newline ends the field
 
 
1570
  self._transition_to_next_state()
 
 
 
 
 
 
1571
  else:
1572
  # Add token ID to sequence (for prefix tree lookup)
1573
  self.accumulated_token_ids.append(generated_token_id)
@@ -1577,14 +1691,28 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
1577
 
1578
  elif self.state == FSMState.GENRES_VALUE:
1579
  if generated_token_id == self.newline_token:
 
1580
  self._transition_to_next_state()
 
 
 
 
 
 
1581
  else:
1582
  # Genres still uses string-based trie, so keep accumulated_value
1583
  self.accumulated_value += token_str
1584
 
1585
  elif self.state == FSMState.KEYSCALE_VALUE:
1586
  if generated_token_id == self.newline_token:
 
1587
  self._transition_to_next_state()
 
 
 
 
 
 
1588
  else:
1589
  # Add token ID to sequence (for prefix tree lookup)
1590
  self.accumulated_token_ids.append(generated_token_id)
 
105
  self.target_codes: Optional[int] = None # Computed target codes count
106
  self.codes_count: int = 0 # Counter for generated codes
107
 
108
+ # Stop at reasoning flag - if True, stop generation after </think> tag
109
+ self.stop_at_reasoning: bool = False
110
+
111
  # Current state
112
  self.state = FSMState.THINK_TAG
113
  self.position_in_state = 0 # Position within current state's fixed string
 
269
  self.skip_genres = skip
270
  self._build_state_transitions()
271
 
272
+ def set_stop_at_reasoning(self, stop: bool):
273
+ """
274
+ Set whether to stop generation after </think> tag.
275
+
276
+ Args:
277
+ stop: If True, generation will stop immediately after </think> tag is generated.
278
+ If False, generation continues to codes generation phase.
279
+ """
280
+ self.stop_at_reasoning = stop
281
+
282
  def set_user_metadata(self, metadata: Optional[Dict[str, Optional[str]]] = None):
283
  """
284
  Set user-provided metadata fields. Fields that are provided will be used directly
 
970
  """
971
  Get the token IDs that can continue the fixed string from current position.
972
  Returns list of allowed token IDs.
973
+
974
+ Strategy: Find the longest prefix that encodes to a single token, and return that token.
975
+ This ensures we generate by tokens, not character-by-character.
976
  """
977
  remaining = fixed_str[self.position_in_state:]
978
  if not remaining:
979
  return []
980
 
981
+ if self.debug:
982
+ logger.debug(f"_get_allowed_tokens_for_fixed_string: fixed_str={repr(fixed_str)}, position_in_state={self.position_in_state}, remaining={repr(remaining)}")
983
+
984
+ # Try encoding progressively longer prefixes, from longest to shortest
985
+ # We want to find the longest prefix that encodes to a single token
986
+ best_token = None
987
+ best_prefix_len = 0
988
 
989
+ # First pass: find the longest prefix that encodes to exactly one token
990
+ for end in range(len(remaining), 0, -1): # Start from longest prefix
991
+ prefix = remaining[:end]
992
+ tokens = self.tokenizer.encode(prefix, add_special_tokens=False)
993
+ if tokens and len(tokens) == 1:
994
+ # Found a prefix that encodes to a single token
995
+ # Use this one (longest match)
996
+ best_token = tokens[0]
997
+ best_prefix_len = end
998
+ if self.debug:
999
+ logger.debug(f"Found single-token match: prefix={repr(prefix)}, token_id={best_token}, token_text={repr(self.tokenizer.decode([best_token]))}")
1000
+ break
1001
+
1002
+ # If we found a single-token match, return it (this is the preferred case)
1003
+ if best_token is not None:
1004
+ return [best_token]
1005
+
1006
+ # Fallback: if no single-token match found, collect all possible first tokens
1007
+ # This handles edge cases where the string might need multiple tokens
1008
+ # But we still want to prefer longer matches
1009
+ # IMPORTANT: Only consider tokens that actually match the beginning of remaining string
1010
+ # Decode each candidate token and verify it matches the prefix
1011
+ allowed_tokens = {}
1012
+ for end in range(1, min(len(remaining) + 1, 20)): # Limit search to avoid too many iterations
1013
  prefix = remaining[:end]
1014
  tokens = self.tokenizer.encode(prefix, add_special_tokens=False)
1015
  if tokens:
1016
+ first_token = tokens[0]
1017
+ # Verify: decode the token and check it matches the prefix start
1018
+ decoded_token = self.tokenizer.decode([first_token])
1019
+ # Normalize both for comparison (strip and lower)
1020
+ normalized_prefix = prefix.lstrip().lower()
1021
+ normalized_decoded = decoded_token.lstrip().lower()
1022
+
1023
+ # Check if decoded token matches the prefix start (allowing for space prefixes)
1024
+ if normalized_decoded.startswith(normalized_prefix) or normalized_prefix.startswith(normalized_decoded):
1025
+ # Store the longest prefix length for each token
1026
+ if first_token not in allowed_tokens or end > allowed_tokens[first_token]:
1027
+ allowed_tokens[first_token] = end
1028
+
1029
+ # Return tokens sorted by prefix length (longest first)
1030
+ # This ensures we prefer longer matches
1031
+ sorted_tokens = sorted(allowed_tokens.items(), key=lambda x: x[1], reverse=True)
1032
+ result = [token for token, _ in sorted_tokens] if sorted_tokens else []
1033
 
1034
+ if self.debug:
1035
+ logger.debug(f"Fallback: returning {len(result)} tokens: {[(t, repr(self.tokenizer.decode([t]))) for t in result[:5]]}")
1036
+ if result:
1037
+ logger.debug(f"Fixed string: {repr(fixed_str)}, position: {self.position_in_state}, remaining: {repr(remaining)}")
 
1038
 
1039
+ return result
1040
 
1041
  def _get_allowed_digit_tokens(self, min_val: int, max_val: int) -> List[int]:
1042
  """
 
1328
 
1329
  if self.state in self.fixed_strings:
1330
  # Fixed string state: force specific tokens
1331
+ fixed_str = self.fixed_strings[self.state]
1332
+ allowed = self._get_allowed_tokens_for_fixed_string(fixed_str)
1333
+
1334
  if allowed:
1335
+ # Check if we should stop at reasoning (after </think> tag)
1336
+ # This happens when we're about to complete the </think> tag
1337
+ if self.state == FSMState.THINK_END_TAG and self.stop_at_reasoning:
1338
+ # Check if the next token would complete the fixed string
1339
+ # We check if position_in_state + length of next token would complete it
1340
+ # Since we don't know which token will be selected, we check if we're close to completion
1341
+ # Actually, a better approach: check if this is the last character(s) of the fixed string
1342
+ remaining_chars = len(fixed_str) - self.position_in_state
1343
+ # If remaining is small (<= 10 chars, which is typically 1-2 tokens), force EOS
1344
+ if remaining_chars <= 10:
1345
+ # Force EOS token to stop generation
1346
+ if self.eos_token_id is not None:
1347
+ mask[0, self.eos_token_id] = 0
1348
+ scores = scores + mask
1349
+ if self.debug:
1350
+ logger.debug(f"stop_at_reasoning=True: forcing EOS near end of </think> tag (remaining: {remaining_chars} chars)")
1351
+ return scores
1352
+
1353
  for t in allowed:
1354
  mask[0, t] = 0
1355
  # Apply mask
 
1360
  # This will be done in update_state() after token selection
1361
  else:
1362
  # Position exceeds string, move to next state
1363
+ # If stop_at_reasoning is True and we're transitioning from THINK_END_TAG,
1364
+ # force EOS before transitioning
1365
+ if self.state == FSMState.THINK_END_TAG and self.stop_at_reasoning:
1366
+ # Force EOS token to stop generation
1367
+ if self.eos_token_id is not None:
1368
+ mask[0, self.eos_token_id] = 0
1369
+ scores = scores + mask
1370
+ if self.debug:
1371
+ logger.debug(f"stop_at_reasoning=True: forcing EOS after completing </think> tag")
1372
+ return scores
1373
+
1374
  old_state = self.state
1375
  self._transition_to_next_state()
1376
  # Avoid infinite recursion: if we're still in a fixed_strings state, just return scores
 
1439
  # All digits generated, force newline
1440
  if self.newline_token:
1441
  mask[0, self.newline_token] = 0
 
1442
 
1443
  scores = scores + mask
1444
  else:
 
1574
  """Transition to the next FSM state."""
1575
  if self.state in self.next_state:
1576
  old_state = self.state
1577
+ next_state = self.next_state[self.state]
1578
+
1579
+ # If stop_at_reasoning is True and we're transitioning from THINK_END_TAG,
1580
+ # skip CODES_GENERATION and go directly to COMPLETED
1581
+ if self.stop_at_reasoning and old_state == FSMState.THINK_END_TAG:
1582
+ next_state = FSMState.COMPLETED
1583
+ if self.debug:
1584
+ logger.debug(f"stop_at_reasoning=True: skipping CODES_GENERATION, going directly to COMPLETED")
1585
+
1586
+ self.state = next_state
1587
  self.position_in_state = 0
1588
  self.accumulated_value = "" # Legacy, kept for compatibility
1589
  self.accumulated_token_ids = [] # Reset token ID sequence for new field
 
1662
  elif self.state in [FSMState.BPM_VALUE, FSMState.DURATION_VALUE, FSMState.TIMESIG_VALUE]:
1663
  # Accumulate numeric value using token ID sequence
1664
  if generated_token_id == self.newline_token:
1665
+ if self.state == FSMState.DURATION_VALUE and self.accumulated_value:
1666
+ try:
1667
+ generated_duration = int(self.accumulated_value)
1668
+ if self.target_codes is None and generated_duration > 0:
1669
+ self.target_codes = int(generated_duration * 5)
1670
+ if self.debug:
1671
+ logger.debug(f"Synced duration: {generated_duration}s -> Set target_codes limit to {self.target_codes}")
1672
+ except ValueError:
1673
+ if self.debug:
1674
+ logger.warning(f"Could not parse duration value: {self.accumulated_value}")
1675
  # Newline ends the field
1676
+ # Save old state before transition
1677
+ old_state = self.state
1678
  self._transition_to_next_state()
1679
+ # IMPORTANT: After state transition, if new state is a fixed_strings state,
1680
+ # we should NOT update position_in_state with the newline token length,
1681
+ # because that token belongs to the old state, not the new state.
1682
+ # Return early to avoid the fixed_strings update logic below.
1683
+ if self.state in self.fixed_strings:
1684
+ return
1685
  else:
1686
  # Add token ID to sequence (for prefix tree lookup)
1687
  self.accumulated_token_ids.append(generated_token_id)
 
1691
 
1692
  elif self.state == FSMState.GENRES_VALUE:
1693
  if generated_token_id == self.newline_token:
1694
+ # Newline ends the field
1695
  self._transition_to_next_state()
1696
+ # IMPORTANT: After state transition, if new state is a fixed_strings state,
1697
+ # we should NOT update position_in_state with the newline token length,
1698
+ # because that token belongs to the old state, not the new state.
1699
+ # Return early to avoid the fixed_strings update logic below.
1700
+ if self.state in self.fixed_strings:
1701
+ return
1702
  else:
1703
  # Genres still uses string-based trie, so keep accumulated_value
1704
  self.accumulated_value += token_str
1705
 
1706
  elif self.state == FSMState.KEYSCALE_VALUE:
1707
  if generated_token_id == self.newline_token:
1708
+ # Newline ends the field
1709
  self._transition_to_next_state()
1710
+ # IMPORTANT: After state transition, if new state is a fixed_strings state,
1711
+ # we should NOT update position_in_state with the newline token length,
1712
+ # because that token belongs to the old state, not the new state.
1713
+ # Return early to avoid the fixed_strings update logic below.
1714
+ if self.state in self.fixed_strings:
1715
+ return
1716
  else:
1717
  # Add token ID to sequence (for prefix tree lookup)
1718
  self.accumulated_token_ids.append(generated_token_id)
acestep/gradio_ui.py CHANGED
@@ -3,8 +3,11 @@ Gradio UI Components Module
3
  Contains all Gradio interface component definitions and layouts
4
  """
5
  import os
 
 
 
6
  import gradio as gr
7
- from typing import Callable, Optional
8
 
9
 
10
  def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=None) -> gr.Blocks:
@@ -77,9 +80,7 @@ def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_para
77
 
78
  def create_dataset_section(dataset_handler) -> dict:
79
  """Create dataset explorer section"""
80
- with gr.Group():
81
- gr.HTML('<div class="section-header"><h3>📊 Dataset Explorer</h3></div>')
82
-
83
  with gr.Row(equal_height=True):
84
  dataset_type = gr.Dropdown(
85
  choices=["train", "test"],
@@ -355,70 +356,12 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
355
  )
356
 
357
  # Audio Codes for text2music
358
- with gr.Accordion("🎼 Audio Codes (for text2music)", open=True, visible=True) as text2music_audio_codes_group:
359
- with gr.Row(equal_height=True, elem_classes=["lm-hints-row"]):
360
- with gr.Column(scale=9):
361
- text2music_audio_code_string = gr.Textbox(
362
- label="Audio Codes",
363
- placeholder="<|audio_code_10695|><|audio_code_54246|>...",
364
- lines=6,
365
- info="Paste precomputed audio code tokens for text2music generation"
366
- )
367
- with gr.Column(scale=3, elem_classes=["lm-hints-col"]):
368
- with gr.Row(equal_height=True, visible=True) as use_5hz_lm_row:
369
- use_5hz_lm_btn = gr.Button(
370
- "Generate LM Hints",
371
- variant="secondary",
372
- # size="lg",
373
- elem_classes=["lm-hints-btn"],
374
- )
375
-
376
- with gr.Row(equal_height=True):
377
- lm_temperature = gr.Slider(
378
- label="Temperature",
379
- minimum=0.0,
380
- maximum=2.0,
381
- value=0.85,
382
- step=0.1,
383
- scale=1,
384
- info="5Hz LM temperature (higher = random)"
385
- )
386
- lm_cfg_scale = gr.Slider(
387
- label="CFG Scale",
388
- minimum=1.0,
389
- maximum=3.0,
390
- value=2.0,
391
- step=0.1,
392
- scale=1,
393
- info="5Hz LM CFG (1.0 = no CFG)"
394
- )
395
- lm_top_k = gr.Slider(
396
- label="Top-K",
397
- minimum=0,
398
- maximum=100,
399
- value=0,
400
- step=1,
401
- scale=1,
402
- info="Top-K (0 = disabled)"
403
- )
404
- lm_top_p = gr.Slider(
405
- label="Top-P",
406
- minimum=0.0,
407
- maximum=1.0,
408
- value=0.9,
409
- step=0.01,
410
- scale=1,
411
- info="Top-P (1.0 = disabled)"
412
- )
413
- lm_repetition_penalty = gr.Slider(
414
- label="Repetition Penalty",
415
- minimum=0.8,
416
- maximum=1.2,
417
- value=1.0,
418
- step=0.01,
419
- scale=1,
420
- info="Repetition penalty: >1.0 reduces repetition, <1.0 increases it. Use 1.0 or very small values for audio tokens.",
421
- visible=False,
422
  )
423
 
424
  # Repainting controls
@@ -436,17 +379,6 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
436
  minimum=-1,
437
  step=0.1,
438
  )
439
-
440
- # Audio Cover Strength
441
- audio_cover_strength = gr.Slider(
442
- minimum=0.0,
443
- maximum=1.0,
444
- value=1.0,
445
- step=0.01,
446
- label="LM codes strength",
447
- info="Control how many denoising steps use LM-generated codes",
448
- visible=True
449
- )
450
 
451
  # Music Caption
452
  with gr.Accordion("📝 Music Caption", open=True):
@@ -456,17 +388,13 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
456
  placeholder="A peaceful acoustic guitar melody with soft vocals...",
457
  lines=3,
458
  info="Describe the style, genre, instruments, and mood",
459
- scale=7,
460
  )
461
- # Negative prompt for CFG (only visible when LM initialized and cfg_scale > 1)
462
- lm_negative_prompt = gr.Textbox(
463
- label="Negative Prompt",
464
- value="NO USER INPUT",
465
- placeholder="Enter negative prompt for CFG (default: NO USER INPUT)",
466
- visible=True,
467
- info="Negative prompt (use when CFG Scale > 1.0)",
468
- lines=3,
469
- scale=5,
470
  )
471
 
472
  # Lyrics
@@ -502,7 +430,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
502
  )
503
  time_signature = gr.Dropdown(
504
  choices=["2", "3", "4", "N/A", ""],
505
- value="4",
506
  label="Time Signature (optional)",
507
  allow_custom_value=True,
508
  info="2/4, 3/4, 4/4..."
@@ -532,7 +460,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
532
  maximum=8,
533
  value=8,
534
  step=1,
535
- label="Inference Steps",
536
  info="Turbo: max 8, Base: max 100"
537
  )
538
  guidance_scale = gr.Slider(
@@ -540,7 +468,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
540
  maximum=15.0,
541
  value=7.0,
542
  step=0.1,
543
- label="Guidance Scale",
544
  info="Higher values follow text more closely",
545
  visible=False
546
  )
@@ -589,17 +517,84 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
589
  info="Audio format for saved files"
590
  )
591
 
 
 
592
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  output_alignment_preference = gr.Checkbox(
594
  label="Output Attention Focus Score (disabled)",
595
  value=False,
596
  info="Output attention focus score analysis",
597
- interactive=False
 
598
  )
599
 
600
  # Set generate_btn to interactive if service is pre-initialized
601
  generate_btn_interactive = init_params.get('enable_generate', False) if service_pre_initialized else False
602
- generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg", interactive=generate_btn_interactive)
 
 
 
 
 
 
 
603
 
604
  return {
605
  "service_config_accordion": service_config_accordion,
@@ -625,19 +620,17 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
625
  "convert_src_to_codes_btn": convert_src_to_codes_btn,
626
  "text2music_audio_code_string": text2music_audio_code_string,
627
  "text2music_audio_codes_group": text2music_audio_codes_group,
628
- "use_5hz_lm_row": use_5hz_lm_row,
629
- "use_5hz_lm_btn": use_5hz_lm_btn,
630
  "lm_temperature": lm_temperature,
631
  "lm_cfg_scale": lm_cfg_scale,
632
  "lm_top_k": lm_top_k,
633
  "lm_top_p": lm_top_p,
634
- "lm_repetition_penalty": lm_repetition_penalty,
635
  "lm_negative_prompt": lm_negative_prompt,
636
  "repainting_group": repainting_group,
637
  "repainting_start": repainting_start,
638
  "repainting_end": repainting_end,
639
  "audio_cover_strength": audio_cover_strength,
640
  "captions": captions,
 
641
  "lyrics": lyrics,
642
  "vocal_language": vocal_language,
643
  "bpm": bpm,
@@ -654,6 +647,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
654
  "cfg_interval_end": cfg_interval_end,
655
  "audio_format": audio_format,
656
  "output_alignment_preference": output_alignment_preference,
 
657
  "generate_btn": generate_btn,
658
  }
659
 
@@ -728,6 +722,72 @@ def create_results_section(dit_handler) -> dict:
728
  def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section):
729
  """Setup event handlers connecting UI components and business logic"""
730
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  def update_init_status(status_msg, enable_btn):
732
  """Update initialization status and enable/disable generate button"""
733
  return status_msg, gr.update(interactive=enable_btn)
@@ -904,16 +964,101 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
904
  text2music_audio_code_string, repainting_start, repainting_end,
905
  instruction_display_gen, audio_cover_strength, task_type,
906
  use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
 
907
  progress=gr.Progress(track_tqdm=True)
908
  ):
909
- return dit_handler.generate_music(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
910
  captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
911
  time_signature=time_signature, vocal_language=vocal_language,
912
  inference_steps=inference_steps, guidance_scale=guidance_scale,
913
  use_random_seed=random_seed_checkbox, seed=seed,
914
  reference_audio=reference_audio, audio_duration=audio_duration,
915
  batch_size=batch_size_input, src_audio=src_audio,
916
- audio_code_string=text2music_audio_code_string,
917
  repainting_start=repainting_start, repainting_end=repainting_end,
918
  instruction=instruction_display_gen, audio_cover_strength=audio_cover_strength,
919
  task_type=task_type, use_adg=use_adg,
@@ -921,6 +1066,47 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
921
  audio_format=audio_format, lm_temperature=lm_temperature,
922
  progress=progress
923
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
924
 
925
  generation_section["generate_btn"].click(
926
  fn=generate_with_progress,
@@ -949,7 +1135,12 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
949
  generation_section["cfg_interval_start"],
950
  generation_section["cfg_interval_end"],
951
  generation_section["audio_format"],
952
- generation_section["lm_temperature"]
 
 
 
 
 
953
  ],
954
  outputs=[
955
  results_section["generated_audio_1"],
@@ -963,70 +1154,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
963
  results_section["align_plot_1"],
964
  results_section["align_score_2"],
965
  results_section["align_text_2"],
966
- results_section["align_plot_2"]
967
- ]
968
- )
969
-
970
- # 5Hz LM generation (simplified version, can be extended as needed)
971
- def generate_lm_hints_wrapper(caption, lyrics, temperature, cfg_scale, top_k, top_p, repetition_penalty, negative_prompt):
972
- """Wrapper for 5Hz LM generation"""
973
- # Convert top_k: 0 means None (disabled)
974
- top_k_value = None if top_k == 0 else int(top_k)
975
- # Convert top_p: 1.0 means None (disabled)
976
- top_p_value = None if top_p >= 1.0 else top_p
977
- metadata, audio_codes, status = llm_handler.generate_with_5hz_lm(
978
- caption, lyrics, temperature, cfg_scale, negative_prompt,
979
- top_k_value, top_p_value, repetition_penalty
980
- )
981
-
982
- # Extract metadata values and map to UI fields
983
- # Handle bpm
984
- bpm_value = metadata.get('bpm', None)
985
- if bpm_value == "N/A" or bpm_value == "":
986
- bpm_value = None
987
-
988
- # Handle key_scale (metadata uses 'keyscale')
989
- key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
990
- if key_scale_value == "N/A":
991
- key_scale_value = ""
992
-
993
- # Handle time_signature (metadata uses 'timesignature')
994
- time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
995
- if time_signature_value == "N/A":
996
- time_signature_value = ""
997
-
998
- # Handle audio_duration (metadata uses 'duration')
999
- audio_duration_value = metadata.get('duration', -1)
1000
- if audio_duration_value == "N/A" or audio_duration_value == "":
1001
- audio_duration_value = -1
1002
-
1003
- # Return audio codes and all metadata fields
1004
- return (
1005
- audio_codes, # text2music_audio_code_string
1006
- bpm_value, # bpm
1007
- key_scale_value, # key_scale
1008
- time_signature_value, # time_signature
1009
- audio_duration_value, # audio_duration
1010
- )
1011
-
1012
- generation_section["use_5hz_lm_btn"].click(
1013
- fn=generate_lm_hints_wrapper,
1014
- inputs=[
1015
- generation_section["captions"],
1016
- generation_section["lyrics"],
1017
- generation_section["lm_temperature"],
1018
- generation_section["lm_cfg_scale"],
1019
- generation_section["lm_top_k"],
1020
- generation_section["lm_top_p"],
1021
- generation_section["lm_repetition_penalty"],
1022
- generation_section["lm_negative_prompt"]
1023
- ],
1024
- outputs=[
1025
- generation_section["text2music_audio_code_string"],
1026
- generation_section["bpm"],
1027
- generation_section["key_scale"],
1028
- generation_section["time_signature"],
1029
- generation_section["audio_duration"],
1030
  ]
1031
  )
1032
 
@@ -1072,8 +1201,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1072
  audio_cover_strength_info = "Control how many denoising steps use cover mode"
1073
  # Show repainting controls for repaint and lego
1074
  repainting_visible = task_type_value in ["repaint", "lego"]
1075
- # Show use_5hz_lm, lm_temperature for text2music
1076
- use_5hz_lm_visible = task_type_value == "text2music"
1077
  # Show text2music_audio_codes if task is text2music OR if it has content
1078
  # This allows it to stay visible even if user switches task type but has codes
1079
  has_audio_codes = audio_codes_content and str(audio_codes_content).strip()
@@ -1085,7 +1212,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1085
  gr.update(visible=complete_visible), # complete_track_classes
1086
  gr.update(visible=audio_cover_strength_visible, label=audio_cover_strength_label, info=audio_cover_strength_info), # audio_cover_strength
1087
  gr.update(visible=repainting_visible), # repainting_group
1088
- gr.update(visible=use_5hz_lm_visible), # use_5hz_lm_row
1089
  gr.update(visible=text2music_audio_codes_visible), # text2music_audio_codes_group
1090
  )
1091
 
@@ -1105,7 +1231,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1105
  generation_section["complete_track_classes"],
1106
  generation_section["audio_cover_strength"],
1107
  generation_section["repainting_group"],
1108
- generation_section["use_5hz_lm_row"],
1109
  generation_section["text2music_audio_codes_group"],
1110
  ]
1111
  )
@@ -1126,7 +1251,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1126
  generation_section["complete_track_classes"],
1127
  generation_section["audio_cover_strength"],
1128
  generation_section["repainting_group"],
1129
- generation_section["use_5hz_lm_row"],
1130
  generation_section["text2music_audio_codes_group"],
1131
  ]
1132
  )
@@ -1147,7 +1271,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1147
  generation_section["complete_track_classes"],
1148
  generation_section["audio_cover_strength"],
1149
  generation_section["repainting_group"],
1150
- generation_section["use_5hz_lm_row"],
1151
  generation_section["text2music_audio_codes_group"],
1152
  ]
1153
  )
@@ -1171,6 +1294,17 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1171
  outputs=[generation_section["src_audio"]]
1172
  )
1173
 
 
 
 
 
 
 
 
 
 
 
 
1174
  # Auto-expand Audio Uploads accordion when audio is uploaded
1175
  def update_audio_uploads_accordion(reference_audio, src_audio):
1176
  """Update Audio Uploads accordion open state based on whether audio files are present"""
 
3
  Contains all Gradio interface component definitions and layouts
4
  """
5
  import os
6
+ import json
7
+ import random
8
+ import glob
9
  import gradio as gr
10
+ from typing import Callable, Optional, Tuple
11
 
12
 
13
  def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=None) -> gr.Blocks:
 
80
 
81
  def create_dataset_section(dataset_handler) -> dict:
82
  """Create dataset explorer section"""
83
+ with gr.Accordion("📊 Dataset Explorer", open=False):
 
 
84
  with gr.Row(equal_height=True):
85
  dataset_type = gr.Dropdown(
86
  choices=["train", "test"],
 
356
  )
357
 
358
  # Audio Codes for text2music
359
+ with gr.Accordion("🎼 LM Codes Hints", open=False, visible=True) as text2music_audio_codes_group:
360
+ text2music_audio_code_string = gr.Textbox(
361
+ label="LM Codes Hints",
362
+ placeholder="<|audio_code_10695|><|audio_code_54246|>...",
363
+ lines=6,
364
+ info="Paste LM codes hints for text2music generation"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  )
366
 
367
  # Repainting controls
 
379
  minimum=-1,
380
  step=0.1,
381
  )
 
 
 
 
 
 
 
 
 
 
 
382
 
383
  # Music Caption
384
  with gr.Accordion("📝 Music Caption", open=True):
 
388
  placeholder="A peaceful acoustic guitar melody with soft vocals...",
389
  lines=3,
390
  info="Describe the style, genre, instruments, and mood",
391
+ scale=9,
392
  )
393
+ sample_btn = gr.Button(
394
+ "Sample",
395
+ variant="secondary",
396
+ size="sm",
397
+ scale=1,
 
 
 
 
398
  )
399
 
400
  # Lyrics
 
430
  )
431
  time_signature = gr.Dropdown(
432
  choices=["2", "3", "4", "N/A", ""],
433
+ value="",
434
  label="Time Signature (optional)",
435
  allow_custom_value=True,
436
  info="2/4, 3/4, 4/4..."
 
460
  maximum=8,
461
  value=8,
462
  step=1,
463
+ label="DiT Inference Steps",
464
  info="Turbo: max 8, Base: max 100"
465
  )
466
  guidance_scale = gr.Slider(
 
468
  maximum=15.0,
469
  value=7.0,
470
  step=0.1,
471
+ label="DiT Guidance Scale (Only support for base model)",
472
  info="Higher values follow text more closely",
473
  visible=False
474
  )
 
517
  info="Audio format for saved files"
518
  )
519
 
520
+ # LM (Language Model) Parameters
521
+ gr.HTML("<h4>🤖 LM Generation Parameters</h4>")
522
  with gr.Row():
523
+ lm_temperature = gr.Slider(
524
+ label="LM Temperature",
525
+ minimum=0.0,
526
+ maximum=2.0,
527
+ value=0.85,
528
+ step=0.1,
529
+ scale=1,
530
+ info="5Hz LM temperature (higher = more random)"
531
+ )
532
+ lm_cfg_scale = gr.Slider(
533
+ label="LM CFG Scale",
534
+ minimum=1.0,
535
+ maximum=3.0,
536
+ value=2.0,
537
+ step=0.1,
538
+ scale=1,
539
+ info="5Hz LM CFG (1.0 = no CFG)"
540
+ )
541
+ lm_top_k = gr.Slider(
542
+ label="LM Top-K",
543
+ minimum=0,
544
+ maximum=100,
545
+ value=0,
546
+ step=1,
547
+ scale=1,
548
+ info="Top-K (0 = disabled)"
549
+ )
550
+ lm_top_p = gr.Slider(
551
+ label="LM Top-P",
552
+ minimum=0.0,
553
+ maximum=1.0,
554
+ value=0.9,
555
+ step=0.01,
556
+ scale=1,
557
+ info="Top-P (1.0 = disabled)"
558
+ )
559
+
560
+ with gr.Row():
561
+ lm_negative_prompt = gr.Textbox(
562
+ label="LM Negative Prompt",
563
+ value="NO USER INPUT",
564
+ placeholder="Enter negative prompt for CFG (default: NO USER INPUT)",
565
+ info="Negative prompt (use when LM CFG Scale > 1.0)",
566
+ lines=2,
567
+ scale=2,
568
+ )
569
+
570
+ with gr.Row():
571
+ audio_cover_strength = gr.Slider(
572
+ minimum=0.0,
573
+ maximum=1.0,
574
+ value=1.0,
575
+ step=0.01,
576
+ label="LM Codes Strength",
577
+ info="Control how many denoising steps use LM-generated codes",
578
+ scale=1,
579
+ )
580
  output_alignment_preference = gr.Checkbox(
581
  label="Output Attention Focus Score (disabled)",
582
  value=False,
583
  info="Output attention focus score analysis",
584
+ interactive=False,
585
+ scale=1,
586
  )
587
 
588
  # Set generate_btn to interactive if service is pre-initialized
589
  generate_btn_interactive = init_params.get('enable_generate', False) if service_pre_initialized else False
590
+ with gr.Row(equal_height=True):
591
+ think_checkbox = gr.Checkbox(
592
+ label="Think",
593
+ value=True,
594
+ info="Enable llm generate hints",
595
+ scale=1,
596
+ )
597
+ generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg", interactive=generate_btn_interactive, scale=10)
598
 
599
  return {
600
  "service_config_accordion": service_config_accordion,
 
620
  "convert_src_to_codes_btn": convert_src_to_codes_btn,
621
  "text2music_audio_code_string": text2music_audio_code_string,
622
  "text2music_audio_codes_group": text2music_audio_codes_group,
 
 
623
  "lm_temperature": lm_temperature,
624
  "lm_cfg_scale": lm_cfg_scale,
625
  "lm_top_k": lm_top_k,
626
  "lm_top_p": lm_top_p,
 
627
  "lm_negative_prompt": lm_negative_prompt,
628
  "repainting_group": repainting_group,
629
  "repainting_start": repainting_start,
630
  "repainting_end": repainting_end,
631
  "audio_cover_strength": audio_cover_strength,
632
  "captions": captions,
633
+ "sample_btn": sample_btn,
634
  "lyrics": lyrics,
635
  "vocal_language": vocal_language,
636
  "bpm": bpm,
 
647
  "cfg_interval_end": cfg_interval_end,
648
  "audio_format": audio_format,
649
  "output_alignment_preference": output_alignment_preference,
650
+ "think_checkbox": think_checkbox,
651
  "generate_btn": generate_btn,
652
  }
653
 
 
722
  def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section):
723
  """Setup event handlers connecting UI components and business logic"""
724
 
725
+ def load_random_example(task_type: str):
726
+ """Load a random example from the task-specific examples directory
727
+
728
+ Args:
729
+ task_type: The task type (e.g., "text2music")
730
+
731
+ Returns:
732
+ Tuple of (caption_value, lyrics_value, think_value) for updating UI components
733
+ """
734
+ try:
735
+ # Get the project root directory
736
+ current_file = os.path.abspath(__file__)
737
+ project_root = os.path.dirname(os.path.dirname(current_file))
738
+
739
+ # Construct the examples directory path
740
+ examples_dir = os.path.join(project_root, "examples", task_type)
741
+
742
+ # Check if directory exists
743
+ if not os.path.exists(examples_dir):
744
+ gr.Warning(f"Examples directory not found: examples/{task_type}/")
745
+ return "", "", True
746
+
747
+ # Find all JSON files in the directory
748
+ json_files = glob.glob(os.path.join(examples_dir, "*.json"))
749
+
750
+ if not json_files:
751
+ gr.Warning(f"No JSON files found in examples/{task_type}/")
752
+ return "", "", True
753
+
754
+ # Randomly select one file
755
+ selected_file = random.choice(json_files)
756
+
757
+ # Read and parse JSON
758
+ try:
759
+ with open(selected_file, 'r', encoding='utf-8') as f:
760
+ data = json.load(f)
761
+
762
+ # Extract caption (prefer 'caption', fallback to 'prompt')
763
+ caption_value = data.get('caption', data.get('prompt', ''))
764
+ if not isinstance(caption_value, str):
765
+ caption_value = str(caption_value) if caption_value else ''
766
+
767
+ # Extract lyrics
768
+ lyrics_value = data.get('lyrics', '')
769
+ if not isinstance(lyrics_value, str):
770
+ lyrics_value = str(lyrics_value) if lyrics_value else ''
771
+
772
+ # Extract think (default to True if not present)
773
+ think_value = data.get('think', True)
774
+ if not isinstance(think_value, bool):
775
+ think_value = True
776
+
777
+ gr.Info(f"Loaded example from {os.path.basename(selected_file)}")
778
+ return caption_value, lyrics_value, think_value
779
+
780
+ except json.JSONDecodeError as e:
781
+ gr.Warning(f"Failed to parse JSON file {os.path.basename(selected_file)}: {str(e)}")
782
+ return "", "", True
783
+ except Exception as e:
784
+ gr.Warning(f"Error reading file {os.path.basename(selected_file)}: {str(e)}")
785
+ return "", "", True
786
+
787
+ except Exception as e:
788
+ gr.Warning(f"Error loading example: {str(e)}")
789
+ return "", "", True
790
+
791
  def update_init_status(status_msg, enable_btn):
792
  """Update initialization status and enable/disable generate button"""
793
  return status_msg, gr.update(interactive=enable_btn)
 
964
  text2music_audio_code_string, repainting_start, repainting_end,
965
  instruction_display_gen, audio_cover_strength, task_type,
966
  use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
967
+ think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
968
  progress=gr.Progress(track_tqdm=True)
969
  ):
970
+ # If think is enabled (llm_dit mode), generate audio codes using LM first
971
+ audio_code_string_to_use = text2music_audio_code_string
972
+ lm_generated_metadata = None # Store LM-generated metadata for display
973
+ lm_generated_audio_codes = None # Store LM-generated audio codes for display
974
+ if think_checkbox and llm_handler.llm_initialized:
975
+ # Convert top_k: 0 means None (disabled)
976
+ top_k_value = None if lm_top_k == 0 else int(lm_top_k)
977
+ # Convert top_p: 1.0 means None (disabled)
978
+ top_p_value = None if lm_top_p >= 1.0 else lm_top_p
979
+
980
+ # Build user_metadata from user-provided values (only include non-empty values)
981
+ user_metadata = {}
982
+ # Handle bpm: gr.Number can be None, int, float, or string
983
+ if bpm is not None:
984
+ try:
985
+ bpm_value = float(bpm)
986
+ if bpm_value > 0:
987
+ user_metadata['bpm'] = str(int(bpm_value))
988
+ except (ValueError, TypeError):
989
+ # If bpm is not a valid number, skip it
990
+ pass
991
+ if key_scale and key_scale.strip():
992
+ key_scale_clean = key_scale.strip()
993
+ if key_scale_clean.lower() not in ["n/a", ""]:
994
+ user_metadata['keyscale'] = key_scale_clean
995
+ if time_signature and time_signature.strip():
996
+ time_sig_clean = time_signature.strip()
997
+ if time_sig_clean.lower() not in ["n/a", ""]:
998
+ user_metadata['timesignature'] = time_sig_clean
999
+ if audio_duration is not None:
1000
+ try:
1001
+ duration_value = float(audio_duration)
1002
+ if duration_value > 0:
1003
+ user_metadata['duration'] = str(int(duration_value))
1004
+ except (ValueError, TypeError):
1005
+ # If audio_duration is not a valid number, skip it
1006
+ pass
1007
+
1008
+ # Only pass user_metadata if user provided any values, otherwise let LM generate
1009
+ user_metadata_to_pass = user_metadata if user_metadata else None
1010
+
1011
+ # Generate using llm_dit mode (infer_type='llm_dit')
1012
+ metadata, audio_codes, status = llm_handler.generate_with_stop_condition(
1013
+ caption=captions or "",
1014
+ lyrics=lyrics or "",
1015
+ infer_type="llm_dit",
1016
+ temperature=lm_temperature,
1017
+ cfg_scale=lm_cfg_scale,
1018
+ negative_prompt=lm_negative_prompt,
1019
+ top_k=top_k_value,
1020
+ top_p=top_p_value,
1021
+ user_metadata=user_metadata_to_pass,
1022
+ )
1023
+
1024
+ # Store LM-generated metadata and audio codes for display
1025
+ lm_generated_metadata = metadata
1026
+ if audio_codes:
1027
+ audio_code_string_to_use = audio_codes
1028
+ lm_generated_audio_codes = audio_codes
1029
+ # Update metadata fields only if they are empty/None (user didn't provide them)
1030
+ if bpm is None and metadata.get('bpm'):
1031
+ bpm_value = metadata.get('bpm')
1032
+ if bpm_value != "N/A" and bpm_value != "":
1033
+ try:
1034
+ bpm = int(bpm_value)
1035
+ except:
1036
+ pass
1037
+ if not key_scale and metadata.get('keyscale'):
1038
+ key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
1039
+ if key_scale_value != "N/A":
1040
+ key_scale = key_scale_value
1041
+ if not time_signature and metadata.get('timesignature'):
1042
+ time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
1043
+ if time_signature_value != "N/A":
1044
+ time_signature = time_signature_value
1045
+ if audio_duration is None or audio_duration <= 0:
1046
+ audio_duration_value = metadata.get('duration', -1)
1047
+ if audio_duration_value != "N/A" and audio_duration_value != "":
1048
+ try:
1049
+ audio_duration = float(audio_duration_value)
1050
+ except:
1051
+ pass
1052
+
1053
+ # Call generate_music and get results
1054
+ result = dit_handler.generate_music(
1055
  captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
1056
  time_signature=time_signature, vocal_language=vocal_language,
1057
  inference_steps=inference_steps, guidance_scale=guidance_scale,
1058
  use_random_seed=random_seed_checkbox, seed=seed,
1059
  reference_audio=reference_audio, audio_duration=audio_duration,
1060
  batch_size=batch_size_input, src_audio=src_audio,
1061
+ audio_code_string=audio_code_string_to_use,
1062
  repainting_start=repainting_start, repainting_end=repainting_end,
1063
  instruction=instruction_display_gen, audio_cover_strength=audio_cover_strength,
1064
  task_type=task_type, use_adg=use_adg,
 
1066
  audio_format=audio_format, lm_temperature=lm_temperature,
1067
  progress=progress
1068
  )
1069
+
1070
+ # Extract results
1071
+ first_audio, second_audio, all_audio_paths, generation_info, status_message, seed_value_for_ui, \
1072
+ align_score_1, align_text_1, align_plot_1, align_score_2, align_text_2, align_plot_2 = result
1073
+
1074
+ # Append LM-generated metadata to generation_info if available
1075
+ if lm_generated_metadata:
1076
+ metadata_lines = []
1077
+ if lm_generated_metadata.get('bpm'):
1078
+ metadata_lines.append(f"- **BPM:** {lm_generated_metadata['bpm']}")
1079
+ if lm_generated_metadata.get('keyscale'):
1080
+ metadata_lines.append(f"- **KeyScale:** {lm_generated_metadata['keyscale']}")
1081
+ if lm_generated_metadata.get('timesignature'):
1082
+ metadata_lines.append(f"- **Time Signature:** {lm_generated_metadata['timesignature']}")
1083
+ if lm_generated_metadata.get('duration'):
1084
+ metadata_lines.append(f"- **Duration:** {lm_generated_metadata['duration']} seconds")
1085
+ if lm_generated_metadata.get('genres'):
1086
+ metadata_lines.append(f"- **Genres:** {lm_generated_metadata['genres']}")
1087
+
1088
+ if metadata_lines:
1089
+ metadata_section = "\n\n**🤖 LM-Generated Metadata:**\n" + "\n\n".join(metadata_lines)
1090
+ generation_info = metadata_section + "\n\n" + generation_info
1091
+
1092
+ # Update audio codes in UI if LM generated them
1093
+ updated_audio_codes = lm_generated_audio_codes if lm_generated_audio_codes else text2music_audio_code_string
1094
+
1095
+ return (
1096
+ first_audio,
1097
+ second_audio,
1098
+ all_audio_paths,
1099
+ generation_info,
1100
+ status_message,
1101
+ seed_value_for_ui,
1102
+ align_score_1,
1103
+ align_text_1,
1104
+ align_plot_1,
1105
+ align_score_2,
1106
+ align_text_2,
1107
+ align_plot_2,
1108
+ updated_audio_codes # Update audio codes in UI
1109
+ )
1110
 
1111
  generation_section["generate_btn"].click(
1112
  fn=generate_with_progress,
 
1135
  generation_section["cfg_interval_start"],
1136
  generation_section["cfg_interval_end"],
1137
  generation_section["audio_format"],
1138
+ generation_section["lm_temperature"],
1139
+ generation_section["think_checkbox"],
1140
+ generation_section["lm_cfg_scale"],
1141
+ generation_section["lm_top_k"],
1142
+ generation_section["lm_top_p"],
1143
+ generation_section["lm_negative_prompt"]
1144
  ],
1145
  outputs=[
1146
  results_section["generated_audio_1"],
 
1154
  results_section["align_plot_1"],
1155
  results_section["align_score_2"],
1156
  results_section["align_text_2"],
1157
+ results_section["align_plot_2"],
1158
+ generation_section["text2music_audio_code_string"] # Update audio codes display
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1159
  ]
1160
  )
1161
 
 
1201
  audio_cover_strength_info = "Control how many denoising steps use cover mode"
1202
  # Show repainting controls for repaint and lego
1203
  repainting_visible = task_type_value in ["repaint", "lego"]
 
 
1204
  # Show text2music_audio_codes if task is text2music OR if it has content
1205
  # This allows it to stay visible even if user switches task type but has codes
1206
  has_audio_codes = audio_codes_content and str(audio_codes_content).strip()
 
1212
  gr.update(visible=complete_visible), # complete_track_classes
1213
  gr.update(visible=audio_cover_strength_visible, label=audio_cover_strength_label, info=audio_cover_strength_info), # audio_cover_strength
1214
  gr.update(visible=repainting_visible), # repainting_group
 
1215
  gr.update(visible=text2music_audio_codes_visible), # text2music_audio_codes_group
1216
  )
1217
 
 
1231
  generation_section["complete_track_classes"],
1232
  generation_section["audio_cover_strength"],
1233
  generation_section["repainting_group"],
 
1234
  generation_section["text2music_audio_codes_group"],
1235
  ]
1236
  )
 
1251
  generation_section["complete_track_classes"],
1252
  generation_section["audio_cover_strength"],
1253
  generation_section["repainting_group"],
 
1254
  generation_section["text2music_audio_codes_group"],
1255
  ]
1256
  )
 
1271
  generation_section["complete_track_classes"],
1272
  generation_section["audio_cover_strength"],
1273
  generation_section["repainting_group"],
 
1274
  generation_section["text2music_audio_codes_group"],
1275
  ]
1276
  )
 
1294
  outputs=[generation_section["src_audio"]]
1295
  )
1296
 
1297
+ # Sample button - load random example
1298
+ generation_section["sample_btn"].click(
1299
+ fn=load_random_example,
1300
+ inputs=[generation_section["task_type"]],
1301
+ outputs=[
1302
+ generation_section["captions"],
1303
+ generation_section["lyrics"],
1304
+ generation_section["think_checkbox"]
1305
+ ]
1306
+ )
1307
+
1308
  # Auto-expand Audio Uploads accordion when audio is uploaded
1309
  def update_audio_uploads_accordion(reference_audio, src_audio):
1310
  """Update Audio Uploads accordion open state based on whether audio files are present"""
acestep/llm_inference.py CHANGED
@@ -9,14 +9,12 @@ from typing import Optional, Dict, Any, Tuple, List
9
  from contextlib import contextmanager
10
 
11
  import torch
12
- from tqdm import tqdm
13
  from loguru import logger
14
  from transformers import AutoTokenizer, AutoModelForCausalLM
15
  from transformers.generation.streamers import BaseStreamer
16
  from transformers.generation.logits_process import (
17
  LogitsProcessorList,
18
  RepetitionPenaltyLogitsProcessor,
19
- LogitsProcessor,
20
  )
21
  from .constrained_logits_processor import MetadataConstrainedLogitsProcessor
22
 
@@ -229,114 +227,7 @@ class LLMHandler:
229
  self.llm_initialized = False
230
  error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
231
  return error_msg
232
-
233
- def generate_with_5hz_lm_vllm(
234
- self,
235
- caption: str,
236
- lyrics: str,
237
- temperature: float = 0.6,
238
- cfg_scale: float = 1.0,
239
- negative_prompt: str = "NO USER INPUT",
240
- top_k: Optional[int] = None,
241
- top_p: Optional[float] = None,
242
- repetition_penalty: float = 1.0,
243
- use_constrained_decoding: bool = True,
244
- constrained_decoding_debug: bool = False,
245
- metadata_temperature: Optional[float] = 0.85,
246
- codes_temperature: Optional[float] = None,
247
- target_duration: Optional[float] = None,
248
- user_metadata: Optional[Dict[str, Optional[str]]] = None,
249
- ) -> Tuple[Dict[str, Any], str, str]:
250
- """Generate metadata and audio codes using 5Hz LM with vllm backend
251
-
252
- Args:
253
- caption: Text caption for music generation
254
- lyrics: Lyrics for music generation
255
- temperature: Base sampling temperature (used if phase-specific temps not set)
256
- cfg_scale: CFG scale (>1.0 enables CFG)
257
- negative_prompt: Negative prompt for CFG
258
- top_k: Top-k sampling parameter
259
- top_p: Top-p (nucleus) sampling parameter
260
- repetition_penalty: Repetition penalty
261
- use_constrained_decoding: Whether to use FSM-based constrained decoding
262
- constrained_decoding_debug: Whether to print debug info for constrained decoding
263
- metadata_temperature: Temperature for metadata generation (lower = more accurate)
264
- If None, uses base temperature
265
- codes_temperature: Temperature for audio codes generation (higher = more diverse)
266
- If None, uses base temperature
267
- target_duration: Target duration in seconds for codes generation constraint.
268
- 5 codes = 1 second. If specified, blocks EOS until target reached.
269
- """
270
- try:
271
- from nanovllm import SamplingParams
272
-
273
- formatted_prompt = self.build_formatted_prompt(caption, lyrics)
274
- logger.debug(f"[debug] formatted_prompt: {formatted_prompt}")
275
-
276
- # Determine effective temperature for sampler
277
- # If using phase-specific temperatures, set sampler temp to 1.0 (processor handles it)
278
- use_phase_temperatures = metadata_temperature is not None or codes_temperature is not None
279
- effective_sampler_temp = 1.0 if use_phase_temperatures else temperature
280
-
281
- # Use shared constrained decoding processor if enabled
282
- constrained_processor = None
283
- update_state_fn = None
284
- if use_constrained_decoding or use_phase_temperatures:
285
- # Use shared processor, just update caption and settings
286
- self.constrained_processor.enabled = use_constrained_decoding
287
- self.constrained_processor.debug = constrained_decoding_debug
288
- self.constrained_processor.metadata_temperature = metadata_temperature if use_phase_temperatures else None
289
- self.constrained_processor.codes_temperature = codes_temperature if use_phase_temperatures else None
290
- self.constrained_processor.update_caption(caption)
291
- self.constrained_processor.set_target_duration(target_duration)
292
- # Always call set_user_metadata to ensure previous settings are cleared if None
293
- self.constrained_processor.set_user_metadata(user_metadata)
294
-
295
- constrained_processor = self.constrained_processor
296
- update_state_fn = constrained_processor.update_state
297
-
298
- sampling_params = SamplingParams(
299
- max_tokens=self.max_model_len-64,
300
- temperature=effective_sampler_temp,
301
- cfg_scale=cfg_scale,
302
- top_k=top_k,
303
- top_p=top_p,
304
- repetition_penalty=repetition_penalty,
305
- logits_processor=constrained_processor,
306
- logits_processor_update_state=update_state_fn,
307
- )
308
- # Use CFG if cfg_scale > 1.0
309
- if cfg_scale > 1.0:
310
- # Build unconditional prompt (user input replaced with "NO USER INPUT")
311
- formatted_unconditional_prompt = self.build_formatted_prompt(negative_prompt, is_negative_prompt=True)
312
- outputs = self.llm.generate(
313
- [formatted_prompt],
314
- sampling_params,
315
- unconditional_prompts=[formatted_unconditional_prompt]
316
- )
317
- else:
318
- outputs = self.llm.generate([formatted_prompt], sampling_params)
319
- # Extract text from output - handle different output formats
320
- if isinstance(outputs, list) and len(outputs) > 0:
321
- if hasattr(outputs[0], 'outputs') and len(outputs[0].outputs) > 0:
322
- output_text = outputs[0].outputs[0].text
323
- elif hasattr(outputs[0], 'text'):
324
- output_text = outputs[0].text
325
- elif isinstance(outputs[0], dict) and 'text' in outputs[0]:
326
- output_text = outputs[0]['text']
327
- else:
328
- output_text = str(outputs[0])
329
- else:
330
- output_text = str(outputs)
331
- metadata, audio_codes = self.parse_lm_output(output_text)
332
- print(f"[debug]output_text: {output_text}")
333
- codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
334
- return metadata, audio_codes, f"✅ Generated successfully\nOutput length: {len(output_text)} chars\nCodes count: {codes_count}"
335
-
336
- except Exception as e:
337
- error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
338
- return {}, "", error_msg
339
-
340
  def _run_vllm_from_formatted(
341
  self,
342
  formatted_prompt: str,
@@ -352,6 +243,7 @@ class LLMHandler:
352
  codes_temperature: Optional[float] = None,
353
  target_duration: Optional[float] = None,
354
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
 
355
  ) -> str:
356
  """Shared vllm path: accept prebuilt formatted prompt and return text."""
357
  from nanovllm import SamplingParams
@@ -372,6 +264,7 @@ class LLMHandler:
372
  self.constrained_processor.set_target_duration(target_duration)
373
  # Always call set_user_metadata to ensure previous settings are cleared if None
374
  self.constrained_processor.set_user_metadata(user_metadata)
 
375
 
376
  constrained_processor = self.constrained_processor
377
 
@@ -410,213 +303,7 @@ class LLMHandler:
410
  output_text = str(outputs)
411
 
412
  return output_text
413
-
414
- def generate_with_5hz_lm_pt(
415
- self,
416
- caption: str,
417
- lyrics: str,
418
- temperature: float = 0.6,
419
- cfg_scale: float = 1.0,
420
- negative_prompt: str = "NO USER INPUT",
421
- top_k: Optional[int] = None,
422
- top_p: Optional[float] = None,
423
- repetition_penalty: float = 1.0,
424
- use_constrained_decoding: bool = True,
425
- constrained_decoding_debug: bool = False,
426
- metadata_temperature: Optional[float] = 0.85,
427
- codes_temperature: Optional[float] = None,
428
- target_duration: Optional[float] = None,
429
- user_metadata: Optional[Dict[str, Optional[str]]] = None,
430
- ) -> Tuple[Dict[str, Any], str, str]:
431
- """Generate metadata and audio codes using 5Hz LM with PyTorch backend
432
-
433
- Args:
434
- caption: Text caption for music generation
435
- lyrics: Lyrics for music generation
436
- temperature: Base sampling temperature (used if phase-specific temps not set)
437
- cfg_scale: CFG scale (>1.0 enables CFG)
438
- negative_prompt: Negative prompt for CFG
439
- top_k: Top-k sampling parameter
440
- top_p: Top-p (nucleus) sampling parameter
441
- repetition_penalty: Repetition penalty
442
- use_constrained_decoding: Whether to use FSM-based constrained decoding
443
- constrained_decoding_debug: Whether to print debug info for constrained decoding
444
- metadata_temperature: Temperature for metadata generation (lower = more accurate)
445
- If None, uses base temperature
446
- codes_temperature: Temperature for audio codes generation (higher = more diverse)
447
- If None, uses base temperature
448
- target_duration: Target duration in seconds for codes generation constraint.
449
- 5 codes = 1 second. If specified, blocks EOS until target reached.
450
- """
451
- try:
452
- formatted_prompt = self.build_formatted_prompt(caption, lyrics)
453
-
454
- # Tokenize the prompt
455
- inputs = self.llm_tokenizer(
456
- formatted_prompt,
457
- return_tensors="pt",
458
- padding=False,
459
- )
460
-
461
- # Generate with the model
462
- with self._load_model_context():
463
- inputs = {k: v.to(self.device) for k, v in inputs.items()}
464
-
465
- # Get max_new_tokens from model config or use a default
466
- max_new_tokens = getattr(self.llm.config, 'max_new_tokens', 4096)
467
- if hasattr(self, 'max_model_len'):
468
- max_new_tokens = min(max_new_tokens, self.max_model_len - 64)
469
-
470
- # Define custom streamer for tqdm
471
- class TqdmTokenStreamer(BaseStreamer):
472
- def __init__(self, total):
473
- self.pbar = tqdm(total=total, desc="Generating 5Hz tokens", unit="token", maxinterval=1)
474
-
475
- def put(self, value):
476
- # value is tensor of token ids
477
- if value.dim() > 1:
478
- num_tokens = value.numel()
479
- else:
480
- num_tokens = len(value)
481
- self.pbar.update(num_tokens)
482
-
483
- def end(self):
484
- self.pbar.close()
485
-
486
- streamer = TqdmTokenStreamer(total=max_new_tokens)
487
-
488
- # Determine if using phase-specific temperatures
489
- use_phase_temperatures = metadata_temperature is not None or codes_temperature is not None
490
- effective_temperature = 1.0 if use_phase_temperatures else temperature
491
 
492
- # Use shared constrained decoding processor if enabled
493
- constrained_processor = None
494
- if use_constrained_decoding or use_phase_temperatures:
495
- # Use shared processor, just update caption and settings
496
- self.constrained_processor.enabled = use_constrained_decoding
497
- self.constrained_processor.debug = constrained_decoding_debug
498
- self.constrained_processor.metadata_temperature = metadata_temperature if use_phase_temperatures else None
499
- self.constrained_processor.codes_temperature = codes_temperature if use_phase_temperatures else None
500
- self.constrained_processor.update_caption(caption)
501
- self.constrained_processor.set_target_duration(target_duration)
502
- # Always call set_user_metadata to ensure previous settings are cleared if None
503
- self.constrained_processor.set_user_metadata(user_metadata)
504
-
505
- constrained_processor = self.constrained_processor
506
-
507
- # Build logits processor list (only for CFG and repetition penalty)
508
- logits_processor = LogitsProcessorList()
509
-
510
- # Add repetition penalty if needed (generate() doesn't support it natively in all versions)
511
- if repetition_penalty != 1.0:
512
- logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
513
-
514
- # Handle CFG if cfg_scale > 1.0
515
- if cfg_scale > 1.0:
516
- # Build unconditional prompt
517
- formatted_unconditional_prompt = self.build_formatted_prompt(negative_prompt, is_negative_prompt=True)
518
-
519
- # Tokenize both prompts together to ensure same length (with left padding)
520
- # Left padding is important for generation tasks
521
- batch_texts = [formatted_prompt, formatted_unconditional_prompt]
522
- original_padding_side = self.llm_tokenizer.padding_side
523
- self.llm_tokenizer.padding_side = 'left'
524
- batch_inputs = self.llm_tokenizer(
525
- batch_texts,
526
- return_tensors="pt",
527
- padding=True,
528
- truncation=True,
529
- )
530
- self.llm_tokenizer.padding_side = original_padding_side
531
- batch_inputs = {k: v.to(self.device) for k, v in batch_inputs.items()}
532
-
533
- # Extract conditional and unconditional inputs
534
- batch_input_ids = batch_inputs['input_ids'] # [2, seq_len]
535
- batch_attention_mask = batch_inputs.get('attention_mask', None)
536
-
537
- # Use custom CFG generation loop
538
- outputs = self._generate_with_cfg_custom(
539
- batch_input_ids=batch_input_ids,
540
- batch_attention_mask=batch_attention_mask,
541
- max_new_tokens=max_new_tokens,
542
- temperature=effective_temperature,
543
- cfg_scale=cfg_scale,
544
- top_k=top_k,
545
- top_p=top_p,
546
- repetition_penalty=repetition_penalty,
547
- pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
548
- streamer=streamer,
549
- constrained_processor=constrained_processor,
550
- )
551
-
552
- # Extract only the conditional output (first in batch)
553
- outputs = outputs[0:1] # Keep only conditional output
554
- elif use_constrained_decoding or use_phase_temperatures:
555
- # Use custom generation loop for constrained decoding or phase temperatures (non-CFG)
556
- input_ids = inputs['input_ids']
557
- attention_mask = inputs.get('attention_mask', None)
558
-
559
- outputs = self._generate_with_constrained_decoding(
560
- input_ids=input_ids,
561
- attention_mask=attention_mask,
562
- max_new_tokens=max_new_tokens,
563
- temperature=effective_temperature,
564
- top_k=top_k,
565
- top_p=top_p,
566
- repetition_penalty=repetition_penalty,
567
- pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
568
- streamer=streamer,
569
- constrained_processor=constrained_processor,
570
- )
571
- else:
572
- # Generate without CFG using native generate() parameters
573
- with torch.no_grad():
574
- outputs = self.llm.generate(
575
- **inputs,
576
- max_new_tokens=max_new_tokens,
577
- temperature=effective_temperature if effective_temperature > 0 else 1.0,
578
- do_sample=True if effective_temperature > 0 else False,
579
- top_k=top_k if top_k is not None and top_k > 0 else None,
580
- top_p=top_p if top_p is not None and 0.0 < top_p < 1.0 else None,
581
- logits_processor=logits_processor if len(logits_processor) > 0 else None,
582
- pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
583
- streamer=streamer,
584
- )
585
-
586
- # Decode the generated tokens
587
- # outputs is a tensor with shape [batch_size, seq_len], extract first sequence
588
- if isinstance(outputs, torch.Tensor):
589
- if outputs.dim() == 2:
590
- generated_ids = outputs[0]
591
- else:
592
- generated_ids = outputs
593
- else:
594
- generated_ids = outputs[0]
595
-
596
- # Only decode the newly generated tokens (skip the input prompt)
597
- # Use the correct input length based on whether CFG was used
598
- if cfg_scale > 1.0:
599
- # In CFG case, use batch_inputs length (both sequences have same length due to padding)
600
- input_length = batch_inputs['input_ids'].shape[1]
601
- else:
602
- input_length = inputs['input_ids'].shape[1]
603
- generated_ids = generated_ids[input_length:]
604
-
605
- # Move to CPU for decoding
606
- if generated_ids.is_cuda:
607
- generated_ids = generated_ids.cpu()
608
-
609
- output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
610
-
611
- metadata, audio_codes = self.parse_lm_output(output_text)
612
- codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
613
- return metadata, audio_codes, f"✅ Generated successfully\nOutput length: {len(output_text)} chars\nCodes count: {codes_count}"
614
-
615
- except Exception as e:
616
- error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
617
- logger.error(error_msg)
618
- return {}, "", error_msg
619
-
620
  def _run_pt_from_formatted(
621
  self,
622
  formatted_prompt: str,
@@ -630,6 +317,7 @@ class LLMHandler:
630
  constrained_decoding_debug: bool = False,
631
  target_duration: Optional[float] = None,
632
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
 
633
  ) -> str:
634
  """Shared PyTorch path: accept prebuilt formatted prompt and return text."""
635
  inputs = self.llm_tokenizer(
@@ -649,6 +337,7 @@ class LLMHandler:
649
  self.constrained_processor.set_target_duration(target_duration)
650
  # Always call set_user_metadata to ensure previous settings are cleared if None
651
  self.constrained_processor.set_user_metadata(user_metadata)
 
652
 
653
  constrained_processor = self.constrained_processor
654
 
@@ -759,99 +448,13 @@ class LLMHandler:
759
 
760
  output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
761
  return output_text
762
-
763
- def generate_with_5hz_lm(
764
- self,
765
- caption: str,
766
- lyrics: str,
767
- temperature: float = 0.6,
768
- cfg_scale: float = 1.0,
769
- negative_prompt: str = "NO USER INPUT",
770
- top_k: Optional[int] = None,
771
- top_p: Optional[float] = None,
772
- repetition_penalty: float = 1.0,
773
- use_constrained_decoding: bool = True,
774
- constrained_decoding_debug: bool = False,
775
- metadata_temperature: Optional[float] = 0.85,
776
- codes_temperature: Optional[float] = None,
777
- target_duration: Optional[float] = None,
778
- user_metadata: Optional[Dict[str, Optional[str]]] = None,
779
- ) -> Tuple[Dict[str, Any], str, str]:
780
- """Generate metadata and audio codes using 5Hz LM
781
-
782
- Args:
783
- caption: Text caption for music generation
784
- lyrics: Lyrics for music generation
785
- temperature: Base sampling temperature (used if phase-specific temps not set)
786
- cfg_scale: CFG scale (>1.0 enables CFG)
787
- negative_prompt: Negative prompt for CFG
788
- top_k: Top-k sampling parameter
789
- top_p: Top-p (nucleus) sampling parameter
790
- repetition_penalty: Repetition penalty
791
- use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
792
- constrained_decoding_debug: Whether to print debug info for constrained decoding
793
- metadata_temperature: Temperature for metadata generation (lower = more accurate)
794
- Recommended: 0.3-0.5 for accurate metadata
795
- codes_temperature: Temperature for audio codes generation (higher = more diverse)
796
- Recommended: 0.7-1.0 for diverse codes
797
- target_duration: Target duration in seconds for codes generation constraint.
798
- 5 codes = 1 second. If specified, blocks EOS until target reached.
799
- """
800
- # Check if 5Hz LM is initialized
801
- if not hasattr(self, 'llm_initialized') or not self.llm_initialized:
802
- debug_info = f"llm_initialized={getattr(self, 'llm_initialized', 'not set')}, "
803
- debug_info += f"has_llm={hasattr(self, 'llm')}, "
804
- debug_info += f"llm_is_none={getattr(self, 'llm', None) is None}, "
805
- debug_info += f"llm_backend={getattr(self, 'llm_backend', 'not set')}"
806
- return {}, "", f"❌ 5Hz LM not initialized. Please initialize it first. Debug: {debug_info}"
807
-
808
- if not hasattr(self, 'llm') or self.llm is None:
809
- return {}, "", "❌ 5Hz LM model not loaded. Please initialize it first."
810
-
811
- if not hasattr(self, 'llm_backend'):
812
- return {}, "", "❌ 5Hz LM backend not set. Please initialize it first."
813
-
814
- if self.llm_backend == "vllm":
815
- return self.generate_with_5hz_lm_vllm(
816
- caption=caption,
817
- lyrics=lyrics,
818
- temperature=temperature,
819
- cfg_scale=cfg_scale,
820
- negative_prompt=negative_prompt,
821
- top_k=top_k,
822
- top_p=top_p,
823
- repetition_penalty=repetition_penalty,
824
- use_constrained_decoding=use_constrained_decoding,
825
- constrained_decoding_debug=constrained_decoding_debug,
826
- metadata_temperature=metadata_temperature,
827
- codes_temperature=codes_temperature,
828
- target_duration=target_duration,
829
- user_metadata=user_metadata,
830
- )
831
- else:
832
- return self.generate_with_5hz_lm_pt(
833
- caption=caption,
834
- lyrics=lyrics,
835
- temperature=temperature,
836
- cfg_scale=cfg_scale,
837
- negative_prompt=negative_prompt,
838
- top_k=top_k,
839
- top_p=top_p,
840
- repetition_penalty=repetition_penalty,
841
- use_constrained_decoding=use_constrained_decoding,
842
- constrained_decoding_debug=constrained_decoding_debug,
843
- metadata_temperature=metadata_temperature,
844
- codes_temperature=codes_temperature,
845
- target_duration=target_duration,
846
- user_metadata=user_metadata,
847
- )
848
 
849
  def generate_with_stop_condition(
850
  self,
851
  caption: str,
852
  lyrics: str,
853
  infer_type: str,
854
- temperature: float = 0.6,
855
  cfg_scale: float = 1.0,
856
  negative_prompt: str = "NO USER INPUT",
857
  top_k: Optional[int] = None,
@@ -859,8 +462,6 @@ class LLMHandler:
859
  repetition_penalty: float = 1.0,
860
  use_constrained_decoding: bool = True,
861
  constrained_decoding_debug: bool = False,
862
- metadata_temperature: Optional[float] = 0.85,
863
- codes_temperature: Optional[float] = None,
864
  target_duration: Optional[float] = None,
865
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
866
  ) -> Tuple[Dict[str, Any], str, str]:
@@ -879,28 +480,14 @@ class LLMHandler:
879
  if infer_type not in {"dit", "llm_dit"}:
880
  return {}, "", f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
881
 
882
- if infer_type == "llm_dit":
883
- return self.generate_with_5hz_lm(
884
- caption=caption,
885
- lyrics=lyrics,
886
- temperature=temperature,
887
- cfg_scale=cfg_scale,
888
- negative_prompt=negative_prompt,
889
- top_k=top_k,
890
- top_p=top_p,
891
- repetition_penalty=repetition_penalty,
892
- use_constrained_decoding=use_constrained_decoding,
893
- constrained_decoding_debug=constrained_decoding_debug,
894
- metadata_temperature=metadata_temperature,
895
- codes_temperature=codes_temperature,
896
- target_duration=target_duration,
897
- user_metadata=user_metadata,
898
- )
899
-
900
- # dit: generate and truncate at reasoning end tag
901
  formatted_prompt = self.build_formatted_prompt(caption, lyrics)
 
 
 
 
902
  output_text, status = self.generate_from_formatted_prompt(
903
- formatted_prompt,
904
  cfg={
905
  "temperature": temperature,
906
  "cfg_scale": cfg_scale,
@@ -908,20 +495,22 @@ class LLMHandler:
908
  "top_k": top_k,
909
  "top_p": top_p,
910
  "repetition_penalty": repetition_penalty,
 
911
  "user_metadata": user_metadata,
912
  },
913
  use_constrained_decoding=use_constrained_decoding,
914
  constrained_decoding_debug=constrained_decoding_debug,
 
915
  )
916
  if not output_text:
917
  return {}, "", status
918
 
919
- if self.STOP_REASONING_TAG in output_text:
920
- stop_idx = output_text.find(self.STOP_REASONING_TAG)
921
- output_text = output_text[: stop_idx + len(self.STOP_REASONING_TAG)]
922
 
923
- metadata, _audio_codes = self.parse_lm_output(output_text)
924
- return metadata, "", status
 
925
 
926
  def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False) -> str:
927
  """
@@ -952,6 +541,7 @@ class LLMHandler:
952
  cfg: Optional[Dict[str, Any]] = None,
953
  use_constrained_decoding: bool = True,
954
  constrained_decoding_debug: bool = False,
 
955
  ) -> Tuple[str, str]:
956
  """
957
  Generate raw LM text output from a pre-built formatted prompt.
@@ -966,6 +556,7 @@ class LLMHandler:
966
  - target_duration (float): Target duration in seconds for codes generation
967
  use_constrained_decoding: Whether to use FSM-based constrained decoding
968
  constrained_decoding_debug: Whether to enable debug logging for constrained decoding
 
969
 
970
  Returns:
971
  (output_text, status_message)
@@ -1003,6 +594,7 @@ class LLMHandler:
1003
  constrained_decoding_debug=constrained_decoding_debug,
1004
  target_duration=target_duration,
1005
  user_metadata=user_metadata,
 
1006
  )
1007
  return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
1008
 
@@ -1019,6 +611,7 @@ class LLMHandler:
1019
  constrained_decoding_debug=constrained_decoding_debug,
1020
  target_duration=target_duration,
1021
  user_metadata=user_metadata,
 
1022
  )
1023
  return output_text, f"✅ Generated successfully (pt) | length={len(output_text)}"
1024
 
@@ -1436,4 +1029,3 @@ class LLMHandler:
1436
  torch.cuda.empty_cache()
1437
  offload_time = time.time() - start_time
1438
  logger.info(f"Offloaded LLM to CPU in {offload_time:.4f}s")
1439
-
 
9
  from contextlib import contextmanager
10
 
11
  import torch
 
12
  from loguru import logger
13
  from transformers import AutoTokenizer, AutoModelForCausalLM
14
  from transformers.generation.streamers import BaseStreamer
15
  from transformers.generation.logits_process import (
16
  LogitsProcessorList,
17
  RepetitionPenaltyLogitsProcessor,
 
18
  )
19
  from .constrained_logits_processor import MetadataConstrainedLogitsProcessor
20
 
 
227
  self.llm_initialized = False
228
  error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
229
  return error_msg
230
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def _run_vllm_from_formatted(
232
  self,
233
  formatted_prompt: str,
 
243
  codes_temperature: Optional[float] = None,
244
  target_duration: Optional[float] = None,
245
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
246
+ stop_at_reasoning: bool = False,
247
  ) -> str:
248
  """Shared vllm path: accept prebuilt formatted prompt and return text."""
249
  from nanovllm import SamplingParams
 
264
  self.constrained_processor.set_target_duration(target_duration)
265
  # Always call set_user_metadata to ensure previous settings are cleared if None
266
  self.constrained_processor.set_user_metadata(user_metadata)
267
+ self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
268
 
269
  constrained_processor = self.constrained_processor
270
 
 
303
  output_text = str(outputs)
304
 
305
  return output_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  def _run_pt_from_formatted(
308
  self,
309
  formatted_prompt: str,
 
317
  constrained_decoding_debug: bool = False,
318
  target_duration: Optional[float] = None,
319
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
320
+ stop_at_reasoning: bool = False,
321
  ) -> str:
322
  """Shared PyTorch path: accept prebuilt formatted prompt and return text."""
323
  inputs = self.llm_tokenizer(
 
337
  self.constrained_processor.set_target_duration(target_duration)
338
  # Always call set_user_metadata to ensure previous settings are cleared if None
339
  self.constrained_processor.set_user_metadata(user_metadata)
340
+ self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
341
 
342
  constrained_processor = self.constrained_processor
343
 
 
448
 
449
  output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
450
  return output_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
 
452
  def generate_with_stop_condition(
453
  self,
454
  caption: str,
455
  lyrics: str,
456
  infer_type: str,
457
+ temperature: float = 0.85,
458
  cfg_scale: float = 1.0,
459
  negative_prompt: str = "NO USER INPUT",
460
  top_k: Optional[int] = None,
 
462
  repetition_penalty: float = 1.0,
463
  use_constrained_decoding: bool = True,
464
  constrained_decoding_debug: bool = False,
 
 
465
  target_duration: Optional[float] = None,
466
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
467
  ) -> Tuple[Dict[str, Any], str, str]:
 
480
  if infer_type not in {"dit", "llm_dit"}:
481
  return {}, "", f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
482
 
483
+ # Build formatted prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
484
  formatted_prompt = self.build_formatted_prompt(caption, lyrics)
485
+
486
+ # Determine stop condition
487
+ stop_at_reasoning = (infer_type == "dit")
488
+ # For llm_dit mode: use normal generation (stops at EOS)
489
  output_text, status = self.generate_from_formatted_prompt(
490
+ formatted_prompt=formatted_prompt,
491
  cfg={
492
  "temperature": temperature,
493
  "cfg_scale": cfg_scale,
 
495
  "top_k": top_k,
496
  "top_p": top_p,
497
  "repetition_penalty": repetition_penalty,
498
+ "target_duration": target_duration,
499
  "user_metadata": user_metadata,
500
  },
501
  use_constrained_decoding=use_constrained_decoding,
502
  constrained_decoding_debug=constrained_decoding_debug,
503
+ stop_at_reasoning=stop_at_reasoning,
504
  )
505
  if not output_text:
506
  return {}, "", status
507
 
508
+ # Parse output
509
+ metadata, audio_codes = self.parse_lm_output(output_text)
 
510
 
511
+ codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
512
+ status_msg = f"✅ Generated successfully\nOutput length: {len(output_text)} chars\nCodes count: {codes_count}"
513
+ return metadata, audio_codes, status_msg
514
 
515
  def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False) -> str:
516
  """
 
541
  cfg: Optional[Dict[str, Any]] = None,
542
  use_constrained_decoding: bool = True,
543
  constrained_decoding_debug: bool = False,
544
+ stop_at_reasoning: bool = False,
545
  ) -> Tuple[str, str]:
546
  """
547
  Generate raw LM text output from a pre-built formatted prompt.
 
556
  - target_duration (float): Target duration in seconds for codes generation
557
  use_constrained_decoding: Whether to use FSM-based constrained decoding
558
  constrained_decoding_debug: Whether to enable debug logging for constrained decoding
559
+ stop_at_reasoning: If True, stop generation immediately after </think> tag (no audio codes)
560
 
561
  Returns:
562
  (output_text, status_message)
 
594
  constrained_decoding_debug=constrained_decoding_debug,
595
  target_duration=target_duration,
596
  user_metadata=user_metadata,
597
+ stop_at_reasoning=stop_at_reasoning,
598
  )
599
  return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
600
 
 
611
  constrained_decoding_debug=constrained_decoding_debug,
612
  target_duration=target_duration,
613
  user_metadata=user_metadata,
614
+ stop_at_reasoning=stop_at_reasoning,
615
  )
616
  return output_text, f"✅ Generated successfully (pt) | length={len(output_text)}"
617
 
 
1029
  torch.cuda.empty_cache()
1030
  offload_time = time.time() - start_time
1031
  logger.info(f"Offloaded LLM to CPU in {offload_time:.4f}s")
 
examples/text2music/example_01.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "pop, rap, electronic, blues, hip-house, rhythm and blues",
4
+ "lyrics": "[verse]\n我走过深夜的街道\n冷风吹乱思念的漂亮外套\n你的微笑像星光很炫耀\n照亮了我孤独的每分每秒\n\n[chorus]\n愿你是风吹过我的脸\n带我飞过最远最遥远的山间\n愿你是风轻触我的梦\n停在心头不再飘散无迹无踪\n\n[verse]\n一起在喧哗避开世俗的骚动\n独自在天台探望月色的朦胧\n你说爱像音乐带点重节奏\n一拍一跳让我忘了心的温度多空洞\n\n[bridge]\n唱起对你的想念不隐藏\n像诗又像画写满藏不了的渴望\n你的影子挥不掉像风的倔强\n追着你飞扬穿越云海一样泛光\n\n[chorus]\n愿你是风吹过我的手\n暖暖的触碰像春日细雨温柔\n愿你是风盘绕我的身\n深情万万重不会有一天走远走\n\n[verse]\n深夜的钢琴弹起动人的旋律\n低音鼓砸进心底的每一次呼吸\n要是能将爱化作歌声传递\n你是否会听见我心里的真心实意"
5
+ }
examples/text2music/example_02.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "country rock, folk rock, southern rock, bluegrass, country pop",
4
+ "lyrics": "[verse]\nWoke up to the sunrise glow\nTook my heart and hit the road\nWheels hummin' the only tune I know\nStraight to where the wildflowers grow\n\n[verse]\nGot that old map all wrinkled and torn\nDestination unknown but I'm reborn\nWith a smile that the wind has worn\nChasin' dreams that can't be sworn\n\n[chorus]\nRidin' on a highway to sunshine\nGot my shades and my radio on fine\nLeave the shadows in the rearview rhyme\nHeart's racing as we chase the time\n\n[verse]\nMet a girl with a heart of gold\nTold stories that never get old\nHer laugh like a tale that's been told\nA melody so bold yet uncontrolled\n\n[bridge]\nClouds roll by like silent ghosts\nAs we drive along the coast\nWe toast to the days we love the most\nFreedom's song is what we post\n\n[chorus]\nRidin' on a highway to sunshine\nGot my shades and my radio on fine\nLeave the shadows in the rearview rhyme\nHeart's racing as we chase the time"
5
+ }
examples/text2music/example_03.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "hip-house, funk",
4
+ "lyrics": "[verse]\n哎呀跳起来,脚尖踩节拍 (oo-yeah!)\n灯光闪烁像星星盛开 (uh-huh!)\n人人都醒来,把烦恼踹开 (get it!)\n热血沸腾,汗水自己安排\n\n[chorus]\n嘿,你还等啥?快抓住节拍 (come on!)\n光芒指引,让心都不存在 (whoa!)\n点燃热火,我们一起飙high (let’s go!)\n跳入午夜的狂欢时代\n\n[bridge]\n咚咚鼓声啊,让你的灵魂起飞 (woo!)\n手心拍一拍,能量翻倍 (ah-hah!)\n键盘响起来,如宇宙的交汇 (oh yeah!)\n就是这感觉,兄弟姐妹都陶醉\n\n[verse]\n灵魂从不睡,只想继续燃烧 (woo!)\n节奏像热浪,席卷这街道 (ow!)\n大伙儿涌上楼台,满面微笑 (yeah!)\n这一刻属于我们,无可替代\n\n[chorus]\n嘿,你还等啥?快抓住节拍 (come on!)\n光芒指引,让心都不存在 (whoa!)\n点燃热火,我们一起飙high (let’s go!)\n跳入午夜的狂欢时代\n\n[verse]\n世界多精彩,握紧把它打开 (alright!)\n每一步都像星球在摇摆 (uh-huh!)\n无边无际的律动像大海 (oo-yeah!)\n跟着光芒之舞,一起澎湃"
5
+ }
examples/text2music/example_04.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "funk, pop, soul, rock, melodic, guitar, drums, bass, keyboard, percussion, 105 BPM, energetic, upbeat, groovy, vibrant, dynamic",
4
+ "lyrics": "[verse]\nNeon lights they flicker bright\nCity hums in dead of night\nRhythms pulse through concrete veins\nLost in echoes of refrains\n\n[verse]\nBassline groovin' in my chest\nHeartbeats match the city's zest\nElectric whispers fill the air\nSynthesized dreams everywhere\n\n[chorus]\nTurn it up and let it flow\nFeel the fire let it grow\nIn this rhythm we belong\nHear the night sing out our song\n\n[verse]\nGuitar strings they start to weep\nWake the soul from silent sleep\nEvery note a story told\nIn this night we’re bold and gold\n\n[bridge]\nVoices blend in harmony\nLost in pure cacophony\nTimeless echoes timeless cries\nSoulful shouts beneath the skies\n\n[verse]\nKeyboard dances on the keys\nMelodies on evening breeze\nCatch the tune and hold it tight\nIn this moment we take flight"
5
+ }
examples/text2music/example_05.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "electronic rap",
4
+ "lyrics": "[verse]\nWaves on the bass, pulsing in the speakers,\nTurn the dial up, we chasing six-figure features,\nGrinding on the beats, codes in the creases,\nDigital hustler, midnight in sneakers.\n\n[chorus]\nElectro vibes, hearts beat with the hum,\nUrban legends ride, we ain't ever numb,\nCircuits sparking live, tapping on the drum,\nLiving on the edge, never succumb.\n\n[verse]\nSynthesizers blaze, city lights a glow,\nRhythm in the haze, moving with the flow,\nSwagger on stage, energy to blow,\nFrom the blocks to the booth, you already know.\n\n[bridge]\nNight's electric, streets full of dreams,\nBass hits collective, bursting at seams,\nHustle perspective, all in the schemes,\nRise and reflective, ain't no in-betweens.\n\n[verse]\nVibin' with the crew, sync in the wire,\nGot the dance moves, fire in the attire,\nRhythm and blues, soul's our supplier,\nRun the digital zoo, higher and higher.\n\n[chorus]\nElectro vibes, hearts beat with the hum,\nUrban legends ride, we ain't ever numb,\nCircuits sparking live, tapping on the drum,\nLiving on the edge, never succumb."
5
+ }
examples/text2music/example_06.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "electronic, house, electro house, synthesizer, drums, bass, percussion, fast, energetic, uplifting, exciting",
4
+ "lyrics": "[verse]\n霓虹灯下我们追逐\n人群跃动像潮水满布\n热浪袭来吹散孤独\n跳进节奏不如停下脚步\n\n[pre-chorus]\n脚尖触电快点感受\n迎着风声释放自由\n心跳节拍配合节奏\n一切烦恼请靠边游\n\n[chorus]\n夏夜狂奔没有尽头\n星光闪烁舞池不朽\n尽情挥洒所有节奏\n无边热情把你包裹哦\n\n[verse]\n天空翻滚黑云入夜\n每颗星星像音乐律贴\n耳边回响那低音线\n环绕耳际如梦境般甜\n\n[pre-chorus]\n脚尖触电快点感受\n迎着风声释放自由\n心跳节拍配合节奏\n一切烦恼请靠边游\n\n[chorus]\n夏夜狂奔没有尽头\n星光闪烁舞池不朽\n尽情挥洒所有节奏\n无边热情把你包裹哦"
5
+ }
examples/text2music/example_07.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "synth-pop, electronic, pop, synthesizer, drums, bass, piano, 128 BPM, energetic, uplifting, modern",
4
+ "lyrics": "[verse]\nWoke up in a city that's always alive\nNeon lights they shimmer they thrive\nElectric pulses beat they drive\nMy heart races just to survive\n\n[chorus]\nOh electric dreams they keep me high\nThrough the wires I soar and fly\nMidnight rhythms in the sky\nElectric dreams together we’ll defy\n\n[verse]\nLost in the labyrinth of screens\nVirtual love or so it seems\nIn the night the city gleams\nDigital faces haunted by memes\n\n[chorus]\nOh electric dreams they keep me high\nThrough the wires I soar and fly\nMidnight rhythms in the sky\nElectric dreams together we’ll defy\n\n[bridge]\nSilent whispers in my ear\nPixelated love serene and clear\nThrough the chaos find you near\nIn electric dreams no fear\n\n[verse]\nBound by circuits intertwined\nLove like ours is hard to find\nIn this world we’re truly blind\nBut electric dreams free the mind"
5
+ }
examples/text2music/example_08.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "Cuban music, salsa, son, Afro-Cuban, traditional Cuban",
4
+ "lyrics": "[verse]\nSun dips low the night ignites\nBassline hums with gleaming lights\nElectric guitar singing tales so fine\nIn the rhythm we all intertwine\n\n[verse]\nDrums beat steady calling out\nPercussion guides no room for doubt\nElectric pulse through every vein\nDance away every ounce of pain\n\n[chorus]\nFeel the rhythm feel the flow\nLet the music take control\nBassline deep electric hum\nIn this night we're never numb\n\n[bridge]\nStars above they start to glow\nEchoes of the night's soft glow\nElectric strings weave through the air\nIn this moment none compare\n\n[verse]\nHeartbeats sync with every tone\nLost in music never alone\nElectric tales of love and peace\nIn this groove we find release\n\n[chorus]\nFeel the rhythm feel the flow\nLet the music take control\nBassline deep electric hum\nIn this night we're never numb"
5
+ }
examples/text2music/example_09.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "pop, piano, rap, dark, atmospheric",
4
+ "lyrics": "[verse]\n月光爬上窗 染白冷的床\n心跳的方向 带我入迷惘\n黑夜吞噬光 命运的纸张\n爱是血色霜 邪恶又芬芳\n\n[chorus]\n你是猎人的欲望 我是迷途的小羊\n深陷你眼眸的荒 唐突献出心脏\n我在夜里回荡 是谁给我希望\n黑暗风中飘荡 假装不再受伤\n\n[verse]\n心锁在门外 谁会解开关怀\n温柔的手拍 藏着冷酷杀害\n思绪如尘埃 撞击爱的霹雳\n灵魂的独白 为你沾满血迹\n\n[bridge]\n你是噩梦的歌唱 是灵魂的捆绑\n绝望中带着光 悬崖边的渴望\n心跳被你鼓掌 恶魔也痴痴想\n渐渐没了抵抗 古老诡计流淌\n\n[chorus]\n你是猎人的欲望 我是迷途的小羊\n深陷你眼眸的荒 唐突献出心脏\n我在夜里回荡 是谁给我希望\n黑暗风中飘荡 假装不再受伤\n\n[outro]\n爱如月黑无光 渗进梦的战场\n逃入无声的场 放手或心嚷嚷\n隐秘的极端 爱是极致风浪\n灵魂彻底交偿 你是终极虚妄"
5
+ }
examples/text2music/example_10.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "surf music",
4
+ "lyrics": "[verse]\nSunshine on the boulevard the beach is calling loud\nWaves are dancing golden sand under a cotton cloud\nElectric heartbeat pounding fast the tide is on our side\nCatch a wave and feel alive we’ll take it for a ride\n\n[verse]\nPalm trees swaying left to right they know where we belong\nFeel the rhythm of the night it keeps us moving strong\nSea spray kisses salty air we’re flying with the breeze\nChampagne states of mind we ride we do just as we please\n\n[chorus]\nWe’re riding waves of life together hand in hand\nWith every beat we chase the beat it’s our own wonderland\nFeel the music take you higher as the shorelines blur\nThis is our world our endless summer as we live and learn\n\n[bridge]\nMoonlight paints the ocean blue reflections in our eyes\nStars align to light our path we’re surfing through the skies\nEvery moment like a song we sing it loud and clear\nEvery day’s a new adventure with you always near\n\n[verse]\nNeon lights and city sounds they blend with ocean views\nWe’re unstoppable tonight no way that we can lose\nDreams are written in the sand they sparkle in the sun\nTogether we’re a masterpiece our story’s just begun\n\n[chorus]\nWe’re riding waves of life together hand in hand\nWith every beat we chase the beat it’s our own wonderland\nFeel the music take you higher as the shorelines blur\nThis is our world our endless summer as we live and learn"
5
+ }
examples/text2music/example_11.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "alternative rock, pop, rock",
4
+ "lyrics": "[verse]\nBright lights flashing in the city sky\nRunning fast and we don't know why\nElectric nights got our hearts on fire\nChasing dreams we'll never tire\n\n[verse]\nGrit in our eyes wind in our hair\nBreaking rules we don't even care\nShouting loud above the crowd\nLiving life like we're unbowed\n\n[chorus]\nRunning wild in the night so free\nFeel the beat pumping endlessly\nHearts collide in the midnight air\nWe belong we don't have a care\n\n[verse]\nPiercing through like a lightning strike\nEvery moment feels like a hike\nDaring bold never backing down\nKings and queens without a crown\n\n[chorus]\nRunning wild in the night so free\nFeel the beat pumping endlessly\nHearts collide in the midnight air\nWe belong we don't have a care\n\n[bridge]\nClose your eyes let your spirit soar\nWe are the ones who wanted more\nBreaking chains of the mundane\nIn this world we'll make our claim"
5
+ }
examples/text2music/example_12.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "rock, hip - hop, orchestral, bass, drums, electric guitar, piano, synthesizer, violin, viola, cello, fast, energetic, motivational, inspirational, empowering",
4
+ "lyrics": "### **[Intro – Spoken]** \n*\"The streets whisper, their echoes never fade. \nEvery step I take leaves a mark—this ain't just a game.\"* \n\n### **[Hook/Chorus]** \nBorn in the chaos, I weather the storm, \nRising from ashes where warriors are born. \nChains couldn't hold me, the system’s a maze, \nI rewrite the rules, set the city ablaze! \n\n### **[Verse 1]** \nCold nights, empty pockets, dreams laced with fight, \nEvery loss made me sharper, cut deep like a knife. \nThey said I wouldn’t make it, now they watch in despair, \nFrom the curb to the throne, took the pain, made it rare. \nEvery siren’s a melody, every alley holds a tale, \nRose from the shadows, left my name on the trail. \nStreetlights flicker like warnings in the haze, \nBut I move like a phantom, unfazed by the blaze. \n\n### **[Hook/Chorus]** \nBorn in the chaos, I weather the storm, \nRising from ashes where warriors are born. \nChains couldn't hold me, the system’s a maze, \nI rewrite the rules, set the city ablaze! \n\n### **[Verse 2]** \nBarbed wire fences couldn't lock in my mind, \nEvery cage they designed, I left broken behind. \nThey want control, but I’m destined to roam, \nWhere the lost find their voice, where the heart sets the tone. \nSteel and concrete, where the lessons run deep, \nEvery crack in the pavement tells a story of heat. \nBut I rise, undefeated, like a king with no throne, \nWriting scripts in the struggle, my legacy’s stone. \n\n### **[Bridge]** \nFeel the rhythm of the underground roar, \nEvery wound tells a story of the battles before. \nBlood, sweat, and echoes fill the cold midnight, \nBut we move with the fire—unshaken, upright. \n\n### **[Verse 3]** \nNo regrets, no retreat, this game has no pause, \nEvery step that I take is a win for the lost. \nI took lessons from hustlers, wisdom from pain, \nNow the echoes of struggle carve power in my name. \nThey built walls, but I walk through the cracks, \nTurned dirt into gold, never looked back. 
\nThrough the struggle we rise, through the fire we claim, \nThis is more than just music—it's life in the frame. \n\n### **[Hook/Chorus – Reprise]** \nBorn in the chaos, I weather the storm, \nRising from ashes where warriors are born. \nChains couldn't hold me, the system’s a maze, \nI rewrite the rules, set the city ablaze! \n\n### **[Outro – Spoken]** \n*\"The scars, the struggle, the grind—it’s all part of the rhythm. \nWe never break, we never fold. We rise.\"*"
5
+ }
examples/text2music/example_13.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "tango finlandés, campanas, disco, dark pop, electro, guitarra clásica, corridos tumba",
4
+ "lyrics": "[inst]"
5
+ }
examples/text2music/example_14.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "Nightclubs, dance parties, workout playlists, radio broadcasts",
4
+ "lyrics": "Burning in motion, set me alight!\nEvery heartbeat turns into a fight!\nCaged in rhythm, chained in time!\nLove’s a battle— You're Mine! You're Mine!"
5
+ }
examples/text2music/example_15.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "melancholic, world, sad, medieval, soulful",
4
+ "lyrics": "[Verse]\nIn a world so grand he roams the skies alone\nHis heart a heavy stone a tale untold\nWhispers of his past echo through the night\nA lonely dragon searching for the light\n\n[Verse 2]\nOnce a mighty force now he drifts in pain\nHis scales once shimmered now they're dark with shame\nCast out by his kin in shadows he does hide\nA haunting sorrow burns deep inside\n\n[Chorus]\nRoaming endless fields with no friend in sight\nHis roar a mournful cry beneath the moon's pale light\nTears fall like stars as he flies on his way\nA lonely dragon yearning for the break of day\n\n[Bridge]\nThe world turns cold the nights grow long\nIn his heart he carries an ancient song\nOf battles fought and love long gone\nA legend now but his soul is torn\n\n[Verse 3]\nHoping for a day he'll find a kindred soul\nTo share his pain and make him whole\nTill then he drifts a shadow in the sky\nA lonely dragon with tears in his eye\n\n[Chorus]\nRoaming endless fields with no friend in sight\nHis roar a mournful cry beneath the moon's pale light\nTears fall like stars as he flies on his way\nA lonely dragon yearning for the break of day"
5
+ }
examples/text2music/example_16.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "anime, cute female vocals, kawaii pop, j-pop, childish, piano, guitar, synthesizer, fast, happy, cheerful, lighthearted",
4
+ "lyrics": "[Chorus]\nねぇ、顔が赤いよ?\nどうしたの? 熱があるの?\nそれとも怒ってるの?\nねぇ、言ってよ!\n\nどうしてそんな目で見るの?\n私、悪いことした?\n何か間違えたの?\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge]\n目を閉じて、くるっと背を向けて、\n何も見なかったフリするから、\n怒らないで… 許してよ…\n\n[Chorus]\nねぇ、顔が赤いよ?\nどうしたの? 熱があるの?\nそれとも怒ってるの?\nねぇ、言ってよ!\n\nどうしてそんな目で見るの?\n私、悪いことした?\n何か間違えたの?\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge 2]\n待って、もし私が悪いなら、\nごめんなさいって言うから、\nアイスクリームあげるから、\nもう怒らないで?\n\nOoooh… 言ってよ!"
5
+ }
examples/text2music/example_17.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "dark, death rock, metal, hardcore, electric guitar, powerful, bass, drums, 110 bpm, G major",
4
+ "lyrics": "[Verse]\nMy lovers betray me\nThe snake in my garden is hissing\nIn the air is the sweetness of roses\nAnd under my skin\nThere's a thorn\n\n[Verse 2]\nI should have known\nThat God sends his angel in shadows\nWith blood in his veins\nI watch the enemy\nGivin' me the hand of my savior\n\n[Chorus]\nAnd I can't love again\nWith the echo of your name in my head\nWith the demons in my bed\nWith the memories\nYour ghost\nI see it\n'Cause it comes to haunt me\nJust to taunt me\nIt comes to haunt me\nJust to taunt me\n\n[Verse 3]\nWith sugar and spice\nIt's hard to ignore the nostalgia\nWith the men on their knees\nAt the gates of my heart\nHow they beg me\n\n[Verse 4]\nThey say\n\"No one will ever love you\nThe way that I do\nNo one will ever touch you\nThe way that I do\"\n\n[Chorus]\nAnd I can't love again\nWith the echo of your name in my head\nWith the demons in my bed\nWith the memories\nYour ghost\nI see it\n'Cause it comes to haunt me\nJust to taunt me\nIt comes to haunt me\nJust to taunt me"
5
+ }
examples/text2music/example_18.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "aggressive, Heavy Riffs, Blast Beats, Satanic Black Metal",
4
+ "lyrics": "[verse]\nFloating through the galaxy on a midnight ride\nStars are dancing all around in cosmic tides\nFeel the pulse of space and time beneath our feet\nEvery beat a heartbeat in this endless suite\n\n[chorus]\nGalactic dreams under neon lights\nSailing through the velvet nights\nWe are echoes in a cosmic sea\nIn a universe where we are free\n\n[verse]\nPlanetary whispers in the sky tonight\nEvery constellation's got a secret sight\nDistant worlds and moons we have yet to see\nIn the void of space where we can just be\n\n[bridge]\nAsteroids and comets in a ballet they spin\nLost in the rhythm of where our dreams begin\nClose your eyes and let the synths take flight\nWe're voyagers on an electric night\n\n[verse]\nLet the piano keys unlock the stars above\nEvery chord a memory every note is love\nIn this synth symphony we find our grace\nDrifting forever in this boundless space\n\n[chorus]\nGalactic dreams under neon lights\nSailing through the velvet nights\nWe are echoes in a cosmic sea\nIn a universe where we are free"
5
+ }
examples/text2music/example_19.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "r&b, soul, funk/soul",
4
+ "lyrics": "[verse]\nDancing through electric fires\nHeart is buzzing like live wires\nIn your arms I find desire\nFeel the beat as we get higher\n\n[chorus]\nElectric love in the night sky\nWe’re gonna soar baby you and I\nDrop the bass let the rhythm fly\nFeel the heat and don't ask why\n\n[verse]\nWhisper secrets that make me blush\nUnder the neon city hush\nYour touch gives me such a rush\nTurn it up we're feeling lush\n\n[chorus]\nElectric love in the night sky\nWe’re gonna soar baby you and I\nDrop the bass let the rhythm fly\nFeel the heat and don't ask why\n\n[bridge]\nThrough the lights and the smoky haze\nI see you in a thousand ways\nLove's a script and we’re the play\nTurn the page stay till we sway\n\n[chorus]\nElectric love in the night sky\nWe’re gonna soar baby you and I\nDrop the bass let the rhythm fly\nFeel the heat and don't ask why"
5
+ }
examples/text2music/example_20.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "Rap, adult, male, spoken word, singing, bright, energetic, clear",
4
+ "lyrics": "[Intro]\n他们说我来自阴影里\n说我的肤色是原罪的印记\n\n[Verse]\n眼神像刀子刮过 穿透我的皮肤\n带着审判和偏见 让我无处可逃处\n你没听过我的故事 没走过我的路\n凭什么就下一个判决 把我划出你的版图\n你说我威胁到你 抢走了你的机会\n可你可知我付出的 是你不敢想象的血泪\n被贴上标签 被区别对待\n呼吸都是错的 只因我生来就不一样态\n\n[Chorus]\n看不见的墙 把我阻隔在外面\n听不见的声音 屏蔽了我的呼唤\n他们制造偏见 他们散播谎言\n只因为我的存在 让他们觉得不安\n\n[Verse]\n每一次努力争取 都会被审视被放大\n每一个细微的错误 都变成攻击的靶\n他们选择性失明 看不见我的汗水\n只看见他们想看的 带着恶意的定位\n系统性的歧视 像一张无形的网\n把我困在原地 无法自由地翱翔\n他们在享受特权 却指责我的贫困\n嘲笑我的口音 我的名字 我的出身\n\n[Chorus]\n看不见的墙 把我阻隔在外面\n听不见的声音 屏蔽了我的呼唤\n他们制造偏见 他们散播谎言\n只因为我的存在 让他们觉得不安\n\n[Bridge]\n我不想寻求同情 只想被公平对待\n不想被定义被束缚 有选择自己未来的权利\n什么时候 才能放下心中的成见\n看到真正的我 而不是你脑海里的画面\n\n[Outro]\n画面... 不安...\n偏见... 歧视...\n什么时候能停止..."
5
+ }
examples/text2music/example_21.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "Chorus Hook, Melodic Rap, Ambient Synth Pads, adult, rap, Very Fast, Storytelling, Chinese Rap, male, spoken word, bright, energetic, Melodic Flow, clear, clarity, 130 bpm",
4
+ "lyrics": "[Intro]\n舌 头 打 结 了... 快 念 快 念...\n\n[Verse 1]\n这 个 赌 鬼 蹲 在 柜 台 啃 着 苦 瓜 干 快 很 干\n赌 桌 堆 满 骨 牌 古 怪 股 票 和 五 块 钢 镚 儿 钢 镚\n他 甩 出 扑 克 牌 啪 啪 啪 拍 扁 螃 蟹 壳 哦 壳 扁\n又 摸 摸 麻 将 摸 出 幺 鸡 摸 出 发 财 摸 出 一 条 蛇 蛇 蛇\n庄 家 咳 嗽 咳 破 锣 嗓 子 喊 开 开 开 快 开 开\n赌 鬼 咕 嘟 咕 嘟 灌 咖 啡 灌 到 筷 子 戳 穿 碗 快 戳 穿\n空 气 里 飘 着 锅 巴 味 混 合 隔 夜 的 酸 奶 罐 哦 酸\n输 光 裤 带 还 想 翻 盘 翻 成 煎 饼 摊 老 板 快 翻 盘\n\n[Chorus]\n赌 鬼 赌 鬼 哦 赌 鬼 赌 鬼 快 很 快\n舌 头 打 结 着 念 这 段 哦 这 段 绕 口 令 牌\n若 念 错 一 字 就 罚 你 哦 罚 你 吞 十 斤 海 带\n赌 场 规 矩 就 是 绕 晕 你 哦 绕 晕 你 快 很 快\n\n[Verse 2]\n他 掏 出 铜 板 抠 出 口 袋 最 后 一 颗 快 很 颗\n庄 家 哗 啦 哗 啦 摇 骰 子 摇 出 三 点 又 三 点 哦 三 点\n赌 鬼 急 得 咬 牙 切 齿 咬 到 舌 头 打 蝴 蝶 结 快 打 结\n还 想 押 上 祖 传 的 拖 鞋 拖 把 铁 锅 和 半 包 盐 盐 盐\n突 然 警 笛 嘀 嘟 嘀 嘟 吓 得 他 钻 进 垃 圾 罐 哦 垃 圾\n警 察 咔 嚓 咔 嚓 拍 照 拍 到 他 头 顶 菠 菜 叶 快 拍 照\n最 后 赌 鬼 蹲 监 狱 天 天 背 这 首 绕 口 令 哦 背 不 完\n若 背 错 一 句 就 加 刑 十 年 再 加 十 年 快 加 刑\n\n[Outro]\n舌 头 打 结 了... 赌 鬼 哭 了 哦...\n这 首 歌... 绕 死 人 了 哦..."
5
+ }
examples/text2music/example_22.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "electronic, hip-hop, rap, synthesizer, drums, vocals, fast, energetic, modern, uplifting, young adult, male, spoken word, singing, bright, energetic, clear, 140 bpm, female",
4
+ "lyrics": "[Verse 1]\n红鲤鱼绿鲤鱼,驴在河里追鲤鱼,\n驴追鲤鱼鱼躲驴,气得驴子直喘气。\n扁担长板凳宽,扁担绑在板凳边,\n扁担要绑板凳不让绑,扁担偏要绑上板凳面!\n\n[Chorus]\n绕口令,练嘴皮,\n说快说慢别迟疑,\n红鲤鱼驴扁担板凳,\n一口气念完算你赢!\n\n[Verse 2]\n四是四十是十,十四是十四四十是四十,\n谁说四十是十四,舌头打结别放肆。\n黑化肥会挥发,灰化肥也发黑,\n化肥混一起,黑灰不分嘴发废!\n\n[Chorus]\n绕口令,练嘴皮,\n说快说慢别迟疑,\n四十十四化肥灰,\n念错罚你唱十回!\n\n[Bridge]\n坡上立着一只鹅,坡下流着一条河,\n鹅要过河河渡鹅,河要渡鹅鹅笑河——\n到底谁更啰嗦?!\n\n[Outro]\n嘴皮子功夫别小瞧,\n绕口令rap我最飙,\n下次挑战准备好,\n舌头打结别求饶!"
5
+ }
examples/text2music/example_23.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "singing, bright, slightly nasal, energetic, spoken word, young adult, male, rap music",
4
+ "lyrics": "[Intro]\nYo, check it—speed demon, lyrical heat, uh!\nRatatat like a drum when the beat bumps, uh!\n\n[Verse 1]\nRapatapa tap tap, flash like a snap,\nRap tap tap, I don’t chat, I clap clap clap!\nFingers snap, flow don’t slack, rapataptaptap,\nSpit it fast, hit the gas, rap tap tap rap!\n\n[Pre-Chorus]\nBoom-bap, zoom past, leave ’em flat,\nRap taptaprapataptaptap—where ya at?\n\n[Chorus]\nRapatapa tap tap, yeah, I go brrrr,\nRap tap tap, make the crowd stir!\nRapataptaptap, no lag, just spit,\nRap taptaprapataptaptap—I’m lit!\n\n[Verse 2]\nTongue-twist, quick wrist, rapatapa boom,\nTap tap rap, leave ya stuck like glue-gum!\nNo slow-mo, turbo, rapataptaptap,\nRap tap rap, yeah, I clap clap clap!\n\n[Outro]\nRapatapa—TAP! Mic drop—that’s that."
5
+ }
examples/text2music/example_24.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "G-Funk, Hip Hop, Rap, Female Vocals, Melodic Rap, Summer, Laid-back Groove, Smooth Rhythm, Synthesizer Lead, Heavy Bassline, Groovy, West Coast Hip Hop",
4
+ "lyrics": "(Intro)\nOh yeah... \n\n(Verse 1)\n阳光下,沙滩排球场,一个身影跳跃\n小麦色,运动背心,闪耀活力四射\n她跳起扣杀,动作利落又巧妙\n汗水浸湿发梢,笑容比阳光更美好\n摇摆的节奏,是她的背景配乐\n每一次移动,都踩在鼓点上那么和谐\n我不由自主地停下脚步\n目光被她紧紧锁住\n\n(Chorus)\n沙滩排球女孩, 摇摆节拍下的身材\n无忧无虑的笑容,把我的心都填满\n想走上前去搭讪,嫌自己笨拙呆板\n这青春的气息,耀眼,灿烂!\n\n(Verse 3)\n她和队友击掌庆祝,笑声清脆悦耳\n拿起毛巾擦汗,不经意间瞥我一眼\n鼓起勇气走上前,假装问问时间\n她友好地回答,笑容灿烂没有敷衍\n聊了几句,发现彼此爱这摇摆音乐\n她眼中也闪过惊喜和亲切\n这共同点,让气氛变得融洽又热烈!\n夏天的故事,就这样开始了感觉真切!\n\n(Chorus)\n沙滩排球女孩, 摇摆节拍下的身材\n无忧无虑的笑容,把我的心都填满\n不再犹豫和等待,勇敢把脚步迈开\n这夏天的感觉,心跳,不断!"
5
+ }
examples/text2music/example_25.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "lyrical rap, young adult, female, rap flow, spoken word, ad-libs, bright, energetic, eat, Fast, Engaging, Energetic",
4
+ "lyrics": "[Intro]\n扁擔寬 板凳長 扁擔想綁在板凳上\n扁擔寬 板凳長 扁擔想綁在板凳上\n\n[Verse]\n倫敦 瑪莉蓮 買了 件 旗袍 送 媽媽\n莫斯科 的 夫司基 愛上 牛肉 麵 疙瘩\n各種 顏色 的 皮膚 各種 顏色 的 頭髮\n嘴裡念的 說的 開始 流行 中國話 (中國話)\n\n[Bridge]\n多少年 我們 苦練 英文 發音 和 文法 (yeah)\n這幾年 換他們 捲著 舌頭 學 平上去入 的 變化\n平平 仄仄 平平 仄\n好聰明 的 中國人 好優美 的 中國話\n\n[Verse]\n扁擔寬 板凳長 扁擔想綁在板凳上\n板凳不讓扁擔綁在板凳上 扁擔偏要綁在板凳上\n板凳偏偏不讓扁擔綁在那板凳上\n到底扁擔寬 還是板凳長?\n\n[Verse]\n哥哥弟弟坡前坐\n坡上臥著一隻鵝 坡下流著一條河\n哥哥說 寬寬的河 弟弟說 白白的鵝\n鵝要過河 河要渡鵝\n不知是那鵝過河 還是河渡鵝\n\n[Chorus]\n全世界都在學中國話\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 讓世界都認真聽話\n\n[Verse]\n紐約蘇珊娜開了間禪風 lounge bar\n柏林來的沃夫岡拿胡琴配著電吉他\n各種顏色的皮膚 各種顏色的頭髮\n嘴裡念的 說的 開始流行中國話 (中國話)\n\n[Bridge]\n多少年我們苦練英文發音和文法 (yeah)\n這幾年換他們捲著舌頭學平上去入的變化\n仄仄平平仄仄平\n好聰明的中國人 好優美的中國話\n\n[Verse]\n有個小孩叫小杜 上街打醋又買布\n買了布 打了醋 回頭看見鷹抓兔\n放下布 擱下醋 上前去追鷹和兔\n飛了鷹 跑了兔 灑了醋 濕了布\n\n[Verse]\n嘴說腿 腿說嘴\n嘴說腿 愛跑腿\n腿說嘴 愛賣嘴\n光動嘴 不動腿\n光動腿 不動嘴\n不如不長腿和嘴\n到底是那嘴說腿 還是腿說嘴?\n\n[Chorus]\n全世界都在學中國話\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 讓世界都認真聽話\n\n[outro]\n全世界都在學中國話 (在學中國話)\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 (讓他) 讓世界 (認真) 都認真聽話"
5
+ }
examples/text2music/example_26.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "articulate, spoken word, young adult, warm, rap music, male, clear, street, dark, rap flow, hardcore rap",
4
+ "lyrics": "[verse]\n球场 的 橡胶味 弥漫 隔壁 是 健身房\n场 边上 的 老教练 战术 有 三套\n教 交叉 运球 的 大叔 会 欧洲步 耍 背后 传\n硬 身板 对抗 最 擅长 还 会 急停跳 后仰 投\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[chorus]\n看什么 看什么\n变速 突破 心 自在\n看什么 看什么\n假动作 晃 开 防守 来\n看什么 看什么\n每日 训练 绑 沙袋\n空中拉杆 莫 奇怪\n唰唰 入袋\n\n[verse]\n一个 试探 步后 一记 左 变向 右 变向\n一句 挑衅 我 的 人 别 嚣张\n一再 重演 一颗 我 不 投 的 球\n悬在 篮筐 上 它 一直 在 摇晃\n\n[chorus]\n看什么 看什么\n我 激活 小宇宙 来\n看什么 看什么\n菜鸟 新人 的 名号\n看什么 看什么\n已 被 我 一球 击倒\n\n[chorus]\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n篮球 之 人 切记 勇者 无惧\n是 谁 在 玩 花式 引爆 空气\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n如果 我 有 滞空 逆天 补扣\n为人 热血 不怂 一生 傲骨 吼\n\n[verse]\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[outro]\n快 秀出 指尖 转球 砰\n快 秀出 指尖 转球 砰\n如果 我 有 滞空 吼\n为人 热血 不怂 一生 傲骨 吼\n快 秀出 指尖 转球 砰\n我 用 背传 助攻 吼\n压哨 的 三分 球"
5
+ }
examples/text2music/example_27.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "articulate, spoken word, young adult, warm, rap music, male, clear, street, dark, rap flow, hardcore rap, fast",
4
+ "lyrics": "[verse]\n球场 的 橡胶味 弥漫 隔壁 是 健身房\n场 边上 的 老教练 战术 有 三套\n教 交叉 运球 的 大叔 会 欧洲步 耍 背后 传\n硬 身板 对抗 最 擅长 还 会 急停跳 后仰 投\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[chorus]\n看什么 看什么\n变速 突破 心 自在\n看什么 看什么\n假动作 晃 开 防守 来\n看什么 看什么\n每日 训练 绑 沙袋\n空中拉杆 莫 奇怪\n唰唰 入袋\n\n[verse]\n一个 试探 步后 一记 左 变向 右 变向\n一句 挑衅 我 的 人 别 嚣张\n一再 重演 一颗 我 不 投 的 球\n悬在 篮筐 上 它 一直 在 摇晃\n\n[chorus]\n看什么 看什么\n我 激活 小宇宙 来\n看什么 看什么\n菜鸟 新人 的 名号\n看什么 看什么\n已 被 我 一球 击倒\n\n[chorus]\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n篮球 之 人 切记 勇者 无惧\n是 谁 在 玩 花式 引爆 空气\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n如果 我 有 滞空 逆天 补扣\n为人 热血 不怂 一生 傲骨 吼\n\n[verse]\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[outro]\n快 秀出 指尖 转球 砰\n快 秀出 指尖 转球 砰\n如果 我 有 滞空 吼\n为人 热血 不怂 一生 傲骨 吼\n快 秀出 指尖 转球 砰\n我 用 背传 助攻 吼\n压哨 的 三分 球"
5
+ }
examples/text2music/example_28.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "articulate, spoken word, young adult, rap music, female, clear, energetic, warm",
4
+ "lyrics": "[Intro]\n\"System booting... 语言 模型 loading...\"\n\n[Verse 1]\n硅谷 那个 coder 调试 neural network\n北京 的 极客 训练 A I 写 report\n不同 架构 的 chip 不同 算法 的 war\n屏幕上 跑的 全是 machine learning (learning)\n\n[Bridge]\n多少年 我们 chase 摩尔 定律 的 trend (yeah)\n这两年 换他们 study 中文 N L P\nConvolution L S T M\n好烧脑 的 backprop 好暴力 的 big data\n\n[Verse 2]\nPython 强 say加加 刚 Python 调用 C++ 的 A P I\nsay加加 嫌 Python 太 slow Python 笑 C++ 太 hardcore\nL L V M 默默 generate 中间 code\n到底 interpreter 还是 compiler 屌?\n\n[Verse 3]\nP M 和 engineer\n白板 画满 flow chart 服务器 闪着 red light\nP M 说 add feature engineer 说 no way\n需求 变更 code 重构\n不知 是 P M 太 fly 还是 deadline 太 high\n\n[Chorus]\n全世界 都在 train neural network\nTransformer 的 paper 越来越 难 go through\n全世界 都在 tune 超参数\n我们 写的 bug 让 G P U 都 say no\n\n[Verse 4]\n柏林 hackathon demo blockchain contract\n上海 的 dev 用 federated learning 破 data wall\n各种 语言 的 error 各种 框架 的 doc\nterminal 里 滚的 全是 dependency 冲突\n\n[Bridge]\n曾以为 English 才是 coding 的 language (yeah)\n直到见 G P T 用 文言文 generate 正则 expression\nGradient explode\n好硬核 的 prompt 好头秃 的 debug road\n\n[Verse 5]\n有个 bug 叫 quantum\n测试 环境 run perfect 上线 立即就 crash\n查 log 看 monitor 发现是 thread 不同步\n改 sync 加 lock 慢 deadlock 更难办\n量子 computer 也解不开 这 chaos chain\n\n[Verse 6]\n你说 996 我说 007\n你说 福报 我说 burnout\nProduct 要 agile Boss 要 KPI\nCode 要 elegant deadline 是 tomorrow\n不如 直接 script 自动 submit 离职信\n\n[Outro]\n\"Warning: 内存 leak...core dumping...\"\n全世界 都在 train neural network (neural network)\nLoss 还没 converge 天已经亮\n全世界 都在 tune 超参数\n我们 写的 code (让它) 让 world (reboot) 都 reboot 无效"
5
+ }
examples/text2music/example_29.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "articulate, spoken word, young adult, rap music, male, clear, energetic, warm, relaxed, breathy, night club, auto-tune, mumble rap, trap",
4
+ "lyrics": "[verse]\n这 这 谁 又 在 派 对 喝 多\n我 的 脑 袋\n像 被 驴 踢 过\n不 对 劲\n舌 头 打 结 不 会 说\n你 来 挑 战 我 就 跪\n开 局 直 接 崩 溃\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草!\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来!\n\n[verse]\n这 这 谁 又 在 派 对 丢 人\n我 的 世 界\n已 经 彻 底 崩 溃\n没 有 完 美\n只 有 翻 车 现 场\n以 及 观 众 的 嘲 讽\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草!\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来!"
5
+ }
examples/text2music/example_30.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "四川话, spoken word, male, Tempo - Fast, Elements - Chorus Hook, Subgenre-Satirical Hip Hop, Rap, Chinese-language music, energetic, slightly nasal, Instrument - Live Bass Guitar, adult, Vocals - Syncopated Flow, Genre - Hip-Hop, rapping, bright",
4
+ "lyrics": "[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[verse]\n老子 在 弄堂 斜起 走 想 拦路 的 先 报 名号\n我 早看透 你们 手抖 脚软\n只敢 网上 吠 现实 怂成 猫\n看 你们 混的 真 可怜 整天 蹲在 网吧 蹭 烟\n钱 赚不到 架 不敢打 还 学人 摆 大哥 脸\n\n[verse]\n叫 我 沪上 老 克勒 不是 拉菲 我 不 碰杯\n规矩 我 懒得 讲 太多 钞票 直接 拍 你 脸上 飞\n老子 耐心 差 门槛 高 你 找茬 等于 自 寻 烦恼\n要么 跪 要么 爬 最后 警告 只 说 一 遭\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[verse]\n古巴 雪茄 在 指间 绕 代表 魔都 格调 必须 顶\nOG 在 你 够不到 的 高度 My bro 永远 在 顶层 盯\nCheck my vibe 不靠 大 金劳 留声机 放 周璇 和 白光\n爹妈 太 宠你 养出 巨婴 症 早晚 社会 教你 做人 经\n\n[verse]\n玩 说唱 小囡 太 年轻 要 比 flow 先去 练 气功\n廿年 磨 枪 才 亮 锋芒 我 三十六 招 收 你 入 瓮\n老子 存在 就是 打假 标\n多少 人 眼红 又 不敢 挑\n键盘 侠 的 狠话 像 棉花 糖\n见 真人 秒变 Hello Kitty 叫\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗"
5
+ }
examples/text2music/example_31.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "Rap, Chinese Rap, J-Pop, Anime, kawaii pop, EDM, Aggressive, Intense, Crisp Snare, Super Fast, Clear",
4
+ "lyrics": "(Intro)\nLet's drift away...\n\n(Verse 1)\n现实是灰色的格子间,重复的工作,枯燥的报表 \n敲打着键盘,眼神却放空,意识早已挣脱了肉体的镣铐\n飘向窗外,飞过拥挤的街道,穿过云层,到达想象的群岛\n那里色彩斑斓,形状奇异,逻辑失效,一切都随心所欲地飘摇\n迷幻的鼓点,像心跳的变奏,忽快忽慢,难以预料\n抽象的采样,扭曲的人声,构建一个超现实的音景环绕\n我变成一只鸟,一条鱼,一束光,自由地变换形态和奔跑\n在这白日梦里,我无所不能,摆脱了所有现实的烦恼, feeling the afterglow\n\n(Chorus)\n意识漫游,逃离乏味的轨道 \n迷幻嘻哈的节拍,是白日梦的引导 \n抽象的世界,逻辑被重新构造\nMind wandering free, where reality starts to fade slow\n\n(Verse 2)\n会议室里老板在讲话,声音模糊,像隔着水听不清道\n我的思绪,早已潜入深海,与发光的水母一起舞蹈\n或者飞向外太空,在星云间穿梭,探索未知的星球和轨道\n现实的规则,在这里被打破,物理定律也失去效劳\n白日梦是我的避难所,是精神的氧气罩\n在乏味的现实里,为我注入一点色彩和奇妙\n虽然短暂,虽然虚幻,但它让我能够喘息,重新把能量找到\n然后回到现实,继续扮演那个,循规蹈矩的角色,把梦藏好, keep the dream aglow\n\n(Chorus)\n意识漫游,逃离乏味的轨道\n迷幻嘻哈的节拍,是白日梦的引导\n抽象的世界,逻辑被重新构造\nMind wandering free, where reality starts to fade slow\n"
5
+ }
examples/text2music/example_32.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "J-Pop, Anime, kawaii future bass, Femal vocals, EDM, Boombap, Aggressive, Intense, Crisp Snare, Super Fast, Rap",
4
+ "lyrics": "[Intro]\nYo, 这是来自深渊的怒吼\n\n[Verse]\n指尖飞快刷新,屏幕又亮起\n渴望那点赞,像致命的氧气\n精心修饰的脸庞,完美到诡异\n背后隐藏的疲惫,谁又会在意\n光鲜亮丽的橱窗,贩卖着焦虑\n每个人都在表演,戴着虚伪面具\n比较的游戏,让人逐渐窒息\n迷失在数据洪流,找不到自己\n\n[Chorus]\n这流量的时代,真假早已分不清\n盲目追随潮流,丢掉了初心\n为了那点虚荣,灵魂在沉沦\n看不见的锁链,捆绑每个灵魂\n\n[Verse]\n滤镜下的生活,美得不切实际\n营造虚假繁荣,掩盖内心空虚\n他人的光环下,显得自己多余\n嫉妒和自卑,交织成悲剧\n\n[Chorus]\n朋友圈里炫耀,现实中却叹气\n刷着别人的故事,忘记了呼吸\n算法推荐着你,想看的一切东西\n不知不觉间,你已不再是你\n他们说这是进步,我看是种病\n精神鸦片侵蚀,慢慢要了你的命\n\n[Bridge]\n屏幕亮了又暗,一天又过去\n究竟得到了什么,还是失去了自己\n那真实的连接,在何处寻觅\n困在这迷宫里,找不到出口的轨迹\n\n[Outro]\n我想挣脱,我想呼吸\n这虚拟的繁华,让我喘不过气\n谁能告诉我,这到底有什么意义\n一切都像泡沫,一触就破裂没余地"
5
+ }
examples/text2music/example_33.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "Hip Hop, Hi-hat Rolls, spoken word, Melodic Flow, articulate, Female Rap, 120 BPM, clear, warm, female, melodic Rap, adult, super fast",
4
+ "lyrics": "[Verse 1]\n打南边来了个喇嘛,手里提拉着五斤鳎目,\n打北边来了个哑巴,腰里别着个喇叭。\n喇嘛想换哑巴的喇叭,哑巴摇头不说话,\n鳎目一甩像道闪电,喇叭一响震天涯!\n\n[Chorus]\n丁丁当当,乒乓乓乓,\n话赶话,舌绕梁,\n东边的钉,西边的墙,\n绕不完的弯,唱不完的慌!\n\n[Verse 2]\n墙上一根钉,钉下绳摇晃,\n绳吊着瓶,瓶碰碎了光。\n灯骂瓶,瓶怪绳,绳怨钉,\n稀里哗啦,一场荒唐!\n\n[Chorus]\n丁丁当当,乒乓乓乓,\n话赶话,舌绕梁,\n东边的钉,西边的墙,\n绕不完的弯,唱不完的慌!\n\n[Verse 3]\n板凳宽,扁担长,\n一个偏要绑,一个偏不让。\n青龙洞里龙翻身,\n千年大梦变稻香!\n\n[Bridge]\n麻婆婆的狗,咬破麻叉口,\n麻线穿针眼,补丁也风流。\n左一句,右一句,\n舌头打结心自由!\n\n[Chorus]\n丁丁当当,乒乓乓乓,\n话赶话,舌绕梁,\n东边的钉,西边的墙,\n绕不完的弯,唱不完的慌!"
5
+ }
examples/text2music/example_34.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "东北话, spoken word, male, Tempo - Fast, Elements - Chorus Hook, Subgenre-Satirical Hip Hop, Rap, Chinese-language music, energetic, slightly nasal, Instrument - Live Bass Guitar, adult, Vocals - Syncopated Flow, Genre - Hip-Hop, rapping, bright",
4
+ "lyrics": "[verse]\n挣着 憋屈的 工资 还得 装乐呵\n猫着 怂样儿 还搁 朋友圈 嘚瑟\n扛着 傻逼的 指标 没人 搭把手\n这儿 不是 托儿所 少整 那出儿 哭唧尿嚎\n\n俺们 就像 一条条 老板的 裤衩子\n陪着 笑脸 接他 每一回 突突\n哎呦 老板 今儿个 穿我呗\n他 撅个腚 眼角 瞟你 那熊样\n\n[chorus]\n他们 骂我 打工仔 太多人 没睡醒\n寻思 抠搜 老板 一天天 穷折腾\n不想 俺的 人生 烂在 这嘎达\n不想 俺的 将来 折在 这破棚\n\n老子 不想 上班 老子 是外星人\n你都 把俺 骂急眼了 俺还 这么淡定\n现实 才是 梦 啥时候 能醒啊\n那 糟践人的 答案 在西北风 里飘\n\n[verse]\n瞅见 二愣子 同事 给老板 舔腚沟子\n瞅见 浪蹄子 女同事 在老板 胯骨轴 扭搭\n瞅见 白瞎的 光阴 耗在 没亮儿的 道儿\n瞅见 公交车上 一帮 僵尸 吐酸水\n\n瞅见 俺的 命 定在 苦逼的 坑里\n瞅见 俺的 爱情 被轮了 成了 老处女\n瞅见 好事儿 全归 高富帅\n还有 那些 臭不要脸 扭腚的 货色\n\n[chorus](重复)\n他们 骂我 打工仔 太多人 没睡醒...\n\n[bridge]\n加班 没补助 俺认了\n欠薪 揍员工 把俺 当牲口\n去你妈 的小姘头\n\n[verse]\n破逼 管理制度 净整 娱乐八卦\n撸管式 管理 也就 你自己 嗨\n出点儿 屁事儿 就往 下属 脑瓜子 扣\n挣俩 钢镚儿 立马 牛逼 不分 公母\n\n你挖个 大坑 把俺们 往里 踹\n说这 叫梦想 你当年 多能耐\n俺们 就当 听传销 洗脑课\n可怜 连骗人 你都 就会 这一套\n\n[outro]\n老子 不想 上班\n老子 不想 上班\n老子 不想 上班"
5
+ }
examples/text2music/example_35.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "Rap, J-Pop, Anime, kawaii pop, EDM, Aggressive, Intense, Crisp Snare, Super Fast, Clear",
4
+ "lyrics": "[Intro]\nNya.\n\n[Verse]\n我 在 五 点 二 十 早 起,十 三 点 十 四 弹 会儿 琴\n习 惯 了 坐 班,习惯了 隔夜 的 剩 饭,\n习 惯 了 没有 你\n\n[Verse]\n怕 你 想 不 开,拦 在 你 的 面 前\n那 时 候 摔 得 差 点 住 院\n东 京 的 春 天 莺 莺 燕 燕\n我 说 想 不 想 来 跟 我 玩 音乐\n\n[Verse]\n带 着 我 的 朋 友 守 在 你 的 门 口\n弹 着 我 的 钢 琴 当 伴 奏\n等 你 放 学 后,陪 你 K T V\n端 着 我 的 红 茶 跟 你 碰 杯\n\n[Pre-Chorus]\n忽然间现实淹没了远方\n万家灯火,盖住月光\n奔走,忍受,变成了人偶\n别再对我伸出你的 双 手,会 受 伤\n\n[Chorus]\n明明都向前走,方向却渐渐不同\n时间让你我越走越近,却越来越陌生\n春 天 在 滂 沱 的 大 雨 里 飘 落\n得 了 心 太 高 脸 太 薄 的病\n\n[Bridge]\n我越难过,春日影越顶\n眼泪晃得我看不清\n埋葬了懦弱还有矫情\n却还是会在半夜摸眼睛\n\n青春期大部分时间在工 作\n用微笑换来余额几个零\n戴上了面具也明白了生活\n拼的是数字和脸更是命\n\n[Verse]\n我在五点二十早起,十三点十四弹会琴\n早上要做饭,回家时满地的瓶罐\n\n师 徒 二 人 站 在 我 的 面 前\n台 词 很 熟 练,照 着 就 念\n\n背 后 的 小 睦 扭 扭 捏 捏\n我 说 我 还 有 点 事 要 不 改 天 见\n\n然 后 你 的 双手 握 住 我 的 袖 口\n开 始 哭 着 求 我 不 要 走\n\n[Verse]\n我在下班后,忙活柴米油\n你和你的姐妹住着高楼\n\n苦 来 兮 苦,早 就 没 了\n现 实 扬 鞭,赶 着 我 向 前\n没有时间跟你分辨什么对与错\n\n[Bridge]\n没有什么对错,没有罪过\n谁不曾天真,是我太早看破\n生活一片狼藉,却又不想放弃\n一 边 聚 光 灯 下 绽 放,一 边 坠 落\n故作坚强,筑起心的墙\n越是委屈的伤口,越要藏\nLet it all out, it’s all right\n\n[Outro]\n俺 是 东 京 嘞,东 京 打 工 妹\n\n从虎之门带你转到浅草\n再从新宿转到竹桥\n\n俺 是 东 京 嘞,东 京 打 工 妹\n\n带 你 转 羽田 成田 蒲田 神田\n做 你 嘞 小 甜 甜!\n\n俺 是 东 京 嘞,东 京 打 工 妹\n带 你 转 赤 坂,带 你 转 霞 关\n恁 咋 不 早 说,今 天 不 管 饭\n"
5
+ }
examples/text2music/example_36.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "Rap, adult, male, spoken word, rapping, clear, warm, articulate, Lo-Fi Hip Hop, 100-120 BPM, Keyboard Chords, Male Rap, Lazy Rhythm, Melancholy, Rap",
4
+ "lyrics": "[Intro]\n夜色 很 淡 像 褪色 的 照片 \n但 记忆 却 像 刀锋 一样 锐利 \n\n[Verse 1]\n你 说过 的 甜言蜜语 现在 听来 像 最 恶毒 的 咒骂 \n你 刺进 我 心里 的 刀 现在 还 在 滴血 未 干 哪 \n慵懒 的 旋律 像 我 的 脚步 拖着 沉重 的 躯壳 \n脑海 里 循环 播放 那 画面 快 把 我 逼疯 了 \n键盘 和弦 低沉 又 忧伤 弹奏 着 我 的 绝望 \n我 曾经 的 信任 像 玻璃 一样 被 你 狠狠 地 摔 在 地上 \n不想 振作 不想 原谅 只 想 让 这 一切 都 停止 \n可 心底 有 个 声音 嘶吼 着 要 你 付出 该 有 的 代价 \n\n[Chorus]\n背叛 像 毒药 渗透 我 的 血液 \n复仇 的 火焰 在 我 眼中 燃起 \n哪怕 遍体鳞伤 哪怕 万劫不复 \n我 也 要 亲手 撕碎 你 的 幸福 \n这 是 我 的 哀歌 也 是 我 的 战书 \n键盘 的 音符 每 一下 都 带着 恨意 和 痛苦 \n\n[Verse 2]\n曾经 的 兄弟 现在 面目全非 像 个 陌生人 \n你 的 自私 像 癌细胞 一点点 吞噬 我 的 纯真 \n我 学着 你 的 样子 把 心 锁 起来 不再 轻易 相信 \n让 懒散 的 节奏 包裹 我 给 自己 一点 喘息 \n键盘 的 音色 变得 更加 阴冷 像 秋天 的 雨滴 \n冲刷 掉 所有 温情 只 剩下 彻骨 的 寒意 \n我 不会 大喊大叫 只是 默默 地 计划 \n每 一步 都 走向 让 你 后悔 的 那 一 刹那 \n\n[Chorus]\n背叛 像 毒药 渗透 我 的 血液 \n复仇 的 火焰 在 我 眼中 燃起 \n哪怕 遍体鳞伤 哪怕 万劫不复 \n我 也 要 亲手 撕碎 你 的 幸福 \n这 是 我 的 哀歌 也 是 我 的 战书 \n键盘 的 音符 每 一下 都 带着 恨意 和 痛苦 \n\n[Bridge]\n也许 复仇 不能 带来 平静 \n也许 只 会 让 我 更 堕落 \n但 如果 不 这样 做 \n我 连 活下去 的 勇气 都 没有 \n\n[Outro]\n复仇 复仇 复仇 \n直到 最后 一刻 \n懒散 地 复仇 着 "
5
+ }
examples/text2music/example_37.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "think": true,
3
+ "caption": "Orchestra, Symphony, Sonata, Opera, Concerto, Rap, Beat, DJ, MC, StreetCulture",
4
+ "lyrics": "[verse1]\n羊皮卷轴 墨香飘 莫扎特 熬 安魂曲 通宵 \n和弦齿轮 咔哒转 比 瑞士 手表 更 精密 律动 \n八轨磁带 玩叠叠乐 披头士 炸 录音棚 天花板 \nAI 卷起 新风暴 像 灭霸 打响指 般 简单 \n\n[chorus]\n琴弦 到 代码 进化论 狂飙(skr) \n象牙塔 被 鼠标 点爆 像 泡泡(boom) \n灵感 加 算法 等于 王炸 大招 \n人类 心跳 才是 终极 混音 调料 \n\n[verse2]\n春之祭 召唤 百人 乐团 才够 燥 \n合成器 极客 玩电焊 焊出 赛博 神庙 \nDAW 解放 双手 钢琴卷帘 变 乐高 \n音色库 开挂 像 吃 金币 的 马里奥 \n\nAI 拆解 爵士乐 黑话 像 庖丁 解牛 \nCityPop 复古 滤镜 直接 参数 调油 \n神经网络 偷师 贝多芬 半夜 翻墙头 \n音乐 基因库 被 改写成 超频 万花筒 \n\n[chorus] \n琴弦 到 代码 进化论 狂飙(skr) \n象牙塔 被 鼠标 点爆 像 泡泡(boom) \n灵感 加 算法 等于 王炸 大招 \n人类 心跳 才是 终极 混音 调料 \n\n[verse3] \n电子琴 被 吐槽 塑料 味 超标 \n卧室 制作人 用 鼠标 单挑 整个 乐团 编制 \nAI 伴奏 刚上线 就被 键盘侠 集火 \n却 忘了 电吉他 曾被 说 是 魔鬼 的 副歌 \n\n现在 我 指尖 蹦迪 在 数据 炼丹炉 \n提示词 召唤 莫扎特 跨次元 碰杯 珍珠奶茶 \n当 比特 海洋 淹没 所有 物理 琴柱 \n最后 的 音轨 永远 连着 心脏 的 跳针 \n\n[bridge] \n鹅毛笔 蘸着 银河 当 墨汁(绝了) \n音浪 在 元宇宙 开 分店(疯了) \n技术 迷雾 散成 像素 烟花 \n而 我们 始终 带着 老派 的 心跳 混搭 \n\n[chorus] \n琴弦 到 代码 进化论 狂飙(skr) \n象牙塔 被 鼠标 点爆 像 泡泡(boom) \n灵感 加 算法 等于 王炸 大招 \n人类 心跳 才是 终极 混音 调料 \n\n[outro] \n从 蒸汽 到 硅基 浪潮 我 冲浪(yo) \n用 脑洞 接住 每个 技术 暴击(叮) \n当 所有 设备 没电 的 凌晨 三点钟 \n最 原始 的 旋律 在 胸腔 敲击 成 龙卷风 "
5
+ }
test.py DELETED
@@ -1,212 +0,0 @@
1
- import os
2
- import sys
3
- import torch
4
- import shutil
5
- if sys.platform != 'win32':
6
- import resource
7
- else:
8
- import psutil
9
- from acestep.handler import AceStepHandler
10
-
11
-
12
- def main():
13
- print("Initializing AceStepHandler...")
14
- handler = AceStepHandler()
15
-
16
- # Find checkpoints
17
- checkpoints = handler.get_available_checkpoints()
18
- if checkpoints:
19
- project_root = checkpoints[0]
20
- else:
21
- # Fallback
22
- current_file = os.path.abspath(__file__)
23
- project_root = os.path.join(os.path.dirname(current_file), "checkpoints")
24
-
25
- print(f"Project root (checkpoints dir): {project_root}")
26
-
27
- # Find models
28
- models = handler.get_available_acestep_v15_models()
29
- if not models:
30
- print("No models found. Using default 'acestep-v15-turbo'.")
31
- model_name = "./acestep-v15-turbo"
32
- else:
33
- model_name = models[0]
34
- print(f"Found models: {models}")
35
- print(f"Using model: {model_name}")
36
-
37
- # Initialize service
38
-
39
- use_llm = False
40
-
41
- status, enabled = handler.initialize_service(
42
- project_root=project_root,
43
- config_path=model_name,
44
- device='auto',
45
- use_flash_attention=True, # Default in UI
46
- compile_model=True,
47
- offload_to_cpu=True,
48
- offload_dit_to_cpu=False, # Keep DiT on GPU
49
- quantization="int8_weight_only", # Enable FP8 weight-only quantization
50
- )
51
-
52
- if not enabled:
53
- print(f"Error initializing service: {status}")
54
- return
55
-
56
- print(status)
57
- print("Service initialized successfully.")
58
-
59
- # Prepare inputs
60
- captions = "A soft pop arrangement led by light, fingerpicked guitar sets a gentle foundation, Airy keys subtly fill the background, while delicate percussion adds warmth, The sweet female voice floats above, blending naturally with minimal harmonies in the chorus for an intimate, uplifting sound"
61
-
62
- lyrics = """[Intro]
63
-
64
- [Verse 1]
65
- 风吹动那年仲夏
66
- 翻开谁青涩喧哗
67
- 白枫书架
68
- 第七页码
69
-
70
- [Verse 2]
71
- 珍藏谁的长发
72
- 星夜似手中花洒
73
- 淋湿旧忆木篱笆
74
- 木槿花下
75
- 天蓝发夹
76
- 她默认了他
77
-
78
- [Bridge]
79
- 时光将青春的薄荷红蜡
80
- 匆匆地融化
81
- 她却沉入人海再无应答
82
- 隐没在天涯
83
-
84
- [Chorus]
85
- 燕子在窗前飞掠
86
- 寻不到的花被季节带回
87
- 拧不干的思念如月
88
- 初恋颜色才能够描绘
89
-
90
- 木槿在窗外落雪
91
- 倾泻道别的滋味
92
- 闭上眼听见微咸的泪水
93
- 到后来才知那故梦珍贵
94
-
95
- [Outro]"""
96
-
97
- seeds = "320145306, 1514681811"
98
-
99
- print("Starting generation...")
100
-
101
- # Generate hints using 5Hz LLM
102
- if use_llm:
103
- print("Generating hints using 5Hz LLM...")
104
- lm_temperature = 0.6
105
- metadata, audio_codes, lm_status = handler.generate_with_5hz_lm(captions, lyrics, lm_temperature)
106
- print(f"5Hz LLM Status: {lm_status}")
107
- print(f"Generated Metadata: {metadata}")
108
- print(f"Generated Audio Codes (first 50 chars): {audio_codes[:50]}...")
109
- else:
110
- print("Skipping 5Hz LLM generation...")
111
- metadata = {
112
- 'bpm': 90,
113
- 'keyscale': 'A major',
114
- 'timesignature': '4',
115
- 'duration': 240,
116
- }
117
- audio_codes = None
118
- lm_status = "Skipped"
119
-
120
- # Use generated metadata if available
121
- bpm = metadata.get('bpm', 90)
122
- if bpm == "N/A" or bpm == "":
123
- bpm = 90
124
- else:
125
- try:
126
- bpm = int(float(bpm))
127
- except:
128
- bpm = 90
129
-
130
- key_scale = metadata.get('keyscale', metadata.get('key_scale', "A major"))
131
- if key_scale == "N/A":
132
- key_scale = "A major"
133
-
134
- time_signature = metadata.get('timesignature', metadata.get('time_signature', "4"))
135
- if time_signature == "N/A":
136
- time_signature = "4"
137
-
138
- audio_duration = metadata.get('duration', 120)
139
- if audio_duration == "N/A":
140
- audio_duration = 120
141
- else:
142
- try:
143
- audio_duration = float(audio_duration)
144
- except:
145
- audio_duration = 120
146
-
147
- print(f"Using parameters: BPM={bpm}, Key={key_scale}, Time Sig={time_signature}, Duration={audio_duration}")
148
-
149
- # Reset peak memory stats
150
- if hasattr(torch, 'xpu') and torch.xpu.is_available():
151
- torch.xpu.reset_peak_memory_stats()
152
- elif torch.cuda.is_available():
153
- torch.cuda.reset_peak_memory_stats()
154
-
155
- # Call generate_music
156
- results = handler.generate_music(
157
- captions=captions,
158
- lyrics=lyrics,
159
- bpm=bpm,
160
- key_scale=key_scale,
161
- time_signature=time_signature,
162
- vocal_language="zh",
163
- inference_steps=8,
164
- guidance_scale=7.0,
165
- use_random_seed=False,
166
- seed=seeds,
167
- audio_duration=audio_duration,
168
- batch_size=1,
169
- task_type="text2music",
170
- cfg_interval_start=0.0,
171
- cfg_interval_end=0.95,
172
- audio_format="wav",
173
- use_tiled_decode=True,
174
- audio_code_string=audio_codes,
175
- )
176
-
177
- # Unpack results
178
- (audio1, audio2, saved_files, info, status_msg, seed_val,
179
- align_score1, align_text1, align_plot1,
180
- align_score2, align_text2, align_plot2) = results
181
-
182
- print("\nGeneration Complete!")
183
-
184
- # Print memory stats
185
- if hasattr(torch, 'xpu') and torch.xpu.is_available():
186
- peak_vram = torch.xpu.max_memory_allocated() / (1024 ** 3)
187
- print(f"Peak VRAM usage: {peak_vram:.2f} GB")
188
- elif torch.cuda.is_available():
189
- peak_vram = torch.cuda.max_memory_allocated() / (1024 ** 3)
190
- print(f"Peak VRAM usage: {peak_vram:.2f} GB")
191
-
192
- if sys.platform != 'win32':
193
- peak_ram = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / (1024 ** 2)
194
- else:
195
- process = psutil.Process()
196
- peak_ram = process.memory_info().rss / (1024 ** 3)
197
-
198
- print(f"Peak RAM usage: {peak_ram:.2f} GB")
199
- print(f"Status: {status_msg}")
200
- print(f"Info: {info}")
201
- print(f"Seeds used: {seed_val}")
202
- print(f"Saved files: {saved_files}")
203
-
204
- # Copy files
205
- for f in saved_files:
206
- if os.path.exists(f):
207
- dst = os.path.basename(f)
208
- shutil.copy(f, dst)
209
- print(f"Saved output to: {os.path.abspath(dst)}")
210
-
211
- if __name__ == "__main__":
212
- main()