Gong Junmin committed on
Commit 11a221a · 1 Parent(s): 509f9f2

first commit

Files changed (31)
  1. .gitignore +4 -0
  2. LICENSE +246 -201
  3. acestep/acestep_v15_pipeline.py +67 -0
  4. acestep/gradio_ui.py +744 -0
  5. acestep/handler.py +1100 -0
  6. acestep/third_parts/nano-vllm/LICENSE +21 -0
  7. acestep/third_parts/nano-vllm/README.md +66 -0
  8. acestep/third_parts/nano-vllm/assets/logo.png +3 -0
  9. acestep/third_parts/nano-vllm/bench.py +32 -0
  10. acestep/third_parts/nano-vllm/example.py +33 -0
  11. acestep/third_parts/nano-vllm/nanovllm/__init__.py +2 -0
  12. acestep/third_parts/nano-vllm/nanovllm/config.py +26 -0
  13. acestep/third_parts/nano-vllm/nanovllm/engine/block_manager.py +112 -0
  14. acestep/third_parts/nano-vllm/nanovllm/engine/llm_engine.py +120 -0
  15. acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py +315 -0
  16. acestep/third_parts/nano-vllm/nanovllm/engine/scheduler.py +222 -0
  17. acestep/third_parts/nano-vllm/nanovllm/engine/sequence.py +89 -0
  18. acestep/third_parts/nano-vllm/nanovllm/layers/activation.py +14 -0
  19. acestep/third_parts/nano-vllm/nanovllm/layers/attention.py +75 -0
  20. acestep/third_parts/nano-vllm/nanovllm/layers/embed_head.py +66 -0
  21. acestep/third_parts/nano-vllm/nanovllm/layers/layernorm.py +50 -0
  22. acestep/third_parts/nano-vllm/nanovllm/layers/linear.py +153 -0
  23. acestep/third_parts/nano-vllm/nanovllm/layers/rotary_embedding.py +61 -0
  24. acestep/third_parts/nano-vllm/nanovllm/layers/sampler.py +15 -0
  25. acestep/third_parts/nano-vllm/nanovllm/llm.py +5 -0
  26. acestep/third_parts/nano-vllm/nanovllm/models/qwen3.py +215 -0
  27. acestep/third_parts/nano-vllm/nanovllm/sampling_params.py +13 -0
  28. acestep/third_parts/nano-vllm/nanovllm/utils/context.py +27 -0
  29. acestep/third_parts/nano-vllm/nanovllm/utils/loader.py +28 -0
  30. acestep/third_parts/nano-vllm/pyproject.toml +27 -0
  31. requirements.txt +4 -0
.gitignore CHANGED
@@ -205,3 +205,7 @@ cython_debug/
205
  marimo/_static/
206
  marimo/_lsp/
207
  __marimo__/
208
+ tests/
209
+ checkpoints/
210
+ playground.ipynb
211
+ .history/
LICENSE CHANGED
@@ -1,201 +1,246 @@
1
- Apache License
2
- Version 2.0, January 2004
3
- http://www.apache.org/licenses/
4
-
5
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
-
7
- 1. Definitions.
8
-
9
- "License" shall mean the terms and conditions for use, reproduction,
10
- and distribution as defined by Sections 1 through 9 of this document.
11
-
12
- "Licensor" shall mean the copyright owner or entity authorized by
13
- the copyright owner that is granting the License.
14
-
15
- "Legal Entity" shall mean the union of the acting entity and all
16
- other entities that control, are controlled by, or are under common
17
- control with that entity. For the purposes of this definition,
18
- "control" means (i) the power, direct or indirect, to cause the
19
- direction or management of such entity, whether by contract or
20
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
- outstanding shares, or (iii) beneficial ownership of such entity.
22
-
23
- "You" (or "Your") shall mean an individual or Legal Entity
24
- exercising permissions granted by this License.
25
-
26
- "Source" form shall mean the preferred form for making modifications,
27
- including but not limited to software source code, documentation
28
- source, and configuration files.
29
-
30
- "Object" form shall mean any form resulting from mechanical
31
- transformation or translation of a Source form, including but
32
- not limited to compiled object code, generated documentation,
33
- and conversions to other media types.
34
-
35
- "Work" shall mean the work of authorship, whether in Source or
36
- Object form, made available under the License, as indicated by a
37
- copyright notice that is included in or attached to the work
38
- (an example is provided in the Appendix below).
39
-
40
- "Derivative Works" shall mean any work, whether in Source or Object
41
- form, that is based on (or derived from) the Work and for which the
42
- editorial revisions, annotations, elaborations, or other modifications
43
- represent, as a whole, an original work of authorship. For the purposes
44
- of this License, Derivative Works shall not include works that remain
45
- separable from, or merely link (or bind by name) to the interfaces of,
46
- the Work and Derivative Works thereof.
47
-
48
- "Contribution" shall mean any work of authorship, including
49
- the original version of the Work and any modifications or additions
50
- to that Work or Derivative Works thereof, that is intentionally
51
- submitted to Licensor for inclusion in the Work by the copyright owner
52
- or by an individual or Legal Entity authorized to submit on behalf of
53
- the copyright owner. For the purposes of this definition, "submitted"
54
- means any form of electronic, verbal, or written communication sent
55
- to the Licensor or its representatives, including but not limited to
56
- communication on electronic mailing lists, source code control systems,
57
- and issue tracking systems that are managed by, or on behalf of, the
58
- Licensor for the purpose of discussing and improving the Work, but
59
- excluding communication that is conspicuously marked or otherwise
60
- designated in writing by the copyright owner as "Not a Contribution."
61
-
62
- "Contributor" shall mean Licensor and any individual or Legal Entity
63
- on behalf of whom a Contribution has been received by Licensor and
64
- subsequently incorporated within the Work.
65
-
66
- 2. Grant of Copyright License. Subject to the terms and conditions of
67
- this License, each Contributor hereby grants to You a perpetual,
68
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
- copyright license to reproduce, prepare Derivative Works of,
70
- publicly display, publicly perform, sublicense, and distribute the
71
- Work and such Derivative Works in Source or Object form.
72
-
73
- 3. Grant of Patent License. Subject to the terms and conditions of
74
- this License, each Contributor hereby grants to You a perpetual,
75
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
- (except as stated in this section) patent license to make, have made,
77
- use, offer to sell, sell, import, and otherwise transfer the Work,
78
- where such license applies only to those patent claims licensable
79
- by such Contributor that are necessarily infringed by their
80
- Contribution(s) alone or by combination of their Contribution(s)
81
- with the Work to which such Contribution(s) was submitted. If You
82
- institute patent litigation against any entity (including a
83
- cross-claim or counterclaim in a lawsuit) alleging that the Work
84
- or a Contribution incorporated within the Work constitutes direct
85
- or contributory patent infringement, then any patent licenses
86
- granted to You under this License for that Work shall terminate
87
- as of the date such litigation is filed.
88
-
89
- 4. Redistribution. You may reproduce and distribute copies of the
90
- Work or Derivative Works thereof in any medium, with or without
91
- modifications, and in Source or Object form, provided that You
92
- meet the following conditions:
93
-
94
- (a) You must give any other recipients of the Work or
95
- Derivative Works a copy of this License; and
96
-
97
- (b) You must cause any modified files to carry prominent notices
98
- stating that You changed the files; and
99
-
100
- (c) You must retain, in the Source form of any Derivative Works
101
- that You distribute, all copyright, patent, trademark, and
102
- attribution notices from the Source form of the Work,
103
- excluding those notices that do not pertain to any part of
104
- the Derivative Works; and
105
-
106
- (d) If the Work includes a "NOTICE" text file as part of its
107
- distribution, then any Derivative Works that You distribute must
108
- include a readable copy of the attribution notices contained
109
- within such NOTICE file, excluding those notices that do not
110
- pertain to any part of the Derivative Works, in at least one
111
- of the following places: within a NOTICE text file distributed
112
- as part of the Derivative Works; within the Source form or
113
- documentation, if provided along with the Derivative Works; or,
114
- within a display generated by the Derivative Works, if and
115
- wherever such third-party notices normally appear. The contents
116
- of the NOTICE file are for informational purposes only and
117
- do not modify the License. You may add Your own attribution
118
- notices within Derivative Works that You distribute, alongside
119
- or as an addendum to the NOTICE text from the Work, provided
120
- that such additional attribution notices cannot be construed
121
- as modifying the License.
122
-
123
- You may add Your own copyright statement to Your modifications and
124
- may provide additional or different license terms and conditions
125
- for use, reproduction, or distribution of Your modifications, or
126
- for any such Derivative Works as a whole, provided Your use,
127
- reproduction, and distribution of the Work otherwise complies with
128
- the conditions stated in this License.
129
-
130
- 5. Submission of Contributions. Unless You explicitly state otherwise,
131
- any Contribution intentionally submitted for inclusion in the Work
132
- by You to the Licensor shall be under the terms and conditions of
133
- this License, without any additional terms or conditions.
134
- Notwithstanding the above, nothing herein shall supersede or modify
135
- the terms of any separate license agreement you may have executed
136
- with Licensor regarding such Contributions.
137
-
138
- 6. Trademarks. This License does not grant permission to use the trade
139
- names, trademarks, service marks, or product names of the Licensor,
140
- except as required for reasonable and customary use in describing the
141
- origin of the Work and reproducing the content of the NOTICE file.
142
-
143
- 7. Disclaimer of Warranty. Unless required by applicable law or
144
- agreed to in writing, Licensor provides the Work (and each
145
- Contributor provides its Contributions) on an "AS IS" BASIS,
146
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
- implied, including, without limitation, any warranties or conditions
148
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
- PARTICULAR PURPOSE. You are solely responsible for determining the
150
- appropriateness of using or redistributing the Work and assume any
151
- risks associated with Your exercise of permissions under this License.
152
-
153
- 8. Limitation of Liability. In no event and under no legal theory,
154
- whether in tort (including negligence), contract, or otherwise,
155
- unless required by applicable law (such as deliberate and grossly
156
- negligent acts) or agreed to in writing, shall any Contributor be
157
- liable to You for damages, including any direct, indirect, special,
158
- incidental, or consequential damages of any character arising as a
159
- result of this License or out of the use or inability to use the
160
- Work (including but not limited to damages for loss of goodwill,
161
- work stoppage, computer failure or malfunction, or any and all
162
- other commercial damages or losses), even if such Contributor
163
- has been advised of the possibility of such damages.
164
-
165
- 9. Accepting Warranty or Additional Liability. While redistributing
166
- the Work or Derivative Works thereof, You may choose to offer,
167
- and charge a fee for, acceptance of support, warranty, indemnity,
168
- or other liability obligations and/or rights consistent with this
169
- License. However, in accepting such obligations, You may act only
170
- on Your own behalf and on Your sole responsibility, not on behalf
171
- of any other Contributor, and only if You agree to indemnify,
172
- defend, and hold each Contributor harmless for any liability
173
- incurred by, or claims asserted against, such Contributor by reason
174
- of your accepting any such warranty or additional liability.
175
-
176
- END OF TERMS AND CONDITIONS
177
-
178
- APPENDIX: How to apply the Apache License to your work.
179
-
180
- To apply the Apache License to your work, attach the following
181
- boilerplate notice, with the fields enclosed by brackets "[]"
182
- replaced with your own identifying information. (Don't include
183
- the brackets!) The text should be enclosed in the appropriate
184
- comment syntax for the file format. We also recommend that a
185
- file or class name and description of purpose be included on the
186
- same "printed page" as the copyright notice for easier
187
- identification within third-party archives.
188
-
189
- Copyright [yyyy] [name of copyright owner]
190
-
191
- Licensed under the Apache License, Version 2.0 (the "License");
192
- you may not use this file except in compliance with the License.
193
- You may obtain a copy of the License at
194
-
195
- http://www.apache.org/licenses/LICENSE-2.0
196
-
197
- Unless required by applicable law or agreed to in writing, software
198
- distributed under the License is distributed on an "AS IS" BASIS,
199
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
- See the License for the specific language governing permissions and
201
- limitations under the License.
1
+ ================================================================================
2
+ ACE-STEP LICENSE
3
+ Version 1.0
4
+ ================================================================================
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ --------------------------------------------------------------------------------
9
+
10
+ 1. DEFINITIONS
11
+
12
+ "License" shall mean the terms and conditions for use, reproduction, and
13
+ distribution as defined by Sections 1 through 11 of this document.
14
+
15
+ "Licensor" shall mean ACE Studio (or the copyright owner/entity authorized
16
+ by ACE Studio) that is granting the License.
17
+
18
+ "Legal Entity" shall mean the union of the acting entity and all other
19
+ entities that control, are controlled by, or are under common control with
20
+ that entity. For the purposes of this definition, "control" means (i) the
21
+ power, direct or indirect, to cause the direction or management of such
22
+ entity, whether by contract or otherwise, or (ii) ownership of fifty
23
+ percent (50%) or more of the outstanding shares, or (iii) beneficial
24
+ ownership of such entity.
25
+
26
+ "You" (or "Your") shall mean an individual or Legal Entity exercising
27
+ permissions granted by this License.
28
+
29
+ "Source" form shall mean the preferred form for making modifications,
30
+ including but not limited to software source code, documentation source,
31
+ configuration files, and model training code.
32
+
33
+ "Object" form shall mean any form resulting from mechanical transformation
34
+ or translation of a Source form, including but not limited to compiled
35
+ object code, generated documentation, and conversions to other media types.
36
+
37
+ "Work" shall mean the work of authorship, whether in Source or Object form,
38
+ made available under the License, as indicated by a copyright notice that
39
+ is included in or attached to the work. For the avoidance of doubt, "Work"
40
+ explicitly includes the Model Weights, parameters, and configuration files
41
+ provided by the Licensor.
42
+
43
+ "Derivative Works" shall mean any work, whether in Source or Object form,
44
+ that is based on (or derived from) the Work and for which the editorial
45
+ revisions, annotations, elaborations, or other modifications represent, as
46
+ a whole, an original work of authorship. For the purposes of this License,
47
+ "Derivative Works" shall explicitly include "Derivative Models," defined as
48
+ any modifications to the Model Weights, including but not limited to
49
+ Fine-tunes, LoRAs (Low-Rank Adaptation), adapters, and other distinct
50
+ parameter sets derived from the Work.
51
+
52
+ "Output" shall mean any audio, music, sound recordings, or data generated
53
+ by the use or execution of the Work or Derivative Works.
54
+
55
+ "Contribution" shall mean any work of authorship, including the original
56
+ version of the Work and any modifications or additions to that Work or
57
+ Derivative Works thereof, that is intentionally submitted to Licensor for
58
+ inclusion in the Work by the copyright owner or by an individual or Legal
59
+ Entity authorized to submit on behalf of the copyright owner.
60
+
61
+ "Contributor" shall mean Licensor and any individual or Legal Entity on
62
+ behalf of whom a Contribution has been received by Licensor and subsequently
63
+ incorporated within the Work.
64
+
65
+ --------------------------------------------------------------------------------
66
+
67
+ 2. GRANT OF COPYRIGHT LICENSE
68
+
69
+ Subject to the terms and conditions of this License (including the specific
70
+ restrictions in Section 5 and Section 6), each Contributor hereby grants to
71
+ You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
72
+ irrevocable copyright license to reproduce, prepare Derivative Works of,
73
+ publicly display, publicly perform, sublicense, and distribute the Work and
74
+ such Derivative Works in Source or Object form.
75
+
76
+ --------------------------------------------------------------------------------
77
+
78
+ 3. GRANT OF PATENT LICENSE
79
+
80
+ Subject to the terms and conditions of this License, each Contributor
81
+ hereby grants to You a perpetual, worldwide, non-exclusive, no-charge,
82
+ royalty-free, irrevocable (except as stated in this section) patent license
83
+ to make, have made, use, offer to sell, sell, import, and otherwise transfer
84
+ the Work, where such license applies only to those patent claims licensable
85
+ by such Contributor that are necessarily infringed by their Contribution(s)
86
+ alone or by combination of their Contribution(s) with the Work to which such
87
+ Contribution(s) was submitted.
88
+
89
+ If You institute patent litigation against any entity (including a
90
+ cross-claim or counterclaim in a lawsuit) alleging that the Work or a
91
+ Contribution incorporated within the Work constitutes direct or contributory
92
+ patent infringement, then any patent licenses granted to You under this
93
+ License for that Work shall terminate as of the date such litigation is
94
+ filed.
95
+
96
+ --------------------------------------------------------------------------------
97
+
98
+ 4. REDISTRIBUTION
99
+
100
+ You may reproduce and distribute copies of the Work or Derivative Works
101
+ thereof in any medium, with or without modifications, and in Source or
102
+ Object form, provided that You meet the following conditions:
103
+
104
+ (a) You must give any other recipients of the Work or Derivative Works a
105
+ copy of this License; and
106
+
107
+ (b) You must cause any modified files to carry prominent notices stating
108
+ that You changed the files; and
109
+
110
+ (c) You must retain, in the Source form of any Derivative Works that You
111
+ distribute, all copyright, patent, trademark, and attribution notices
112
+ from the Source form of the Work, excluding those notices that do not
113
+ pertain to any part of the Derivative Works; and
114
+
115
+ (d) If the Work includes a "NOTICE" text file as part of its distribution,
116
+ then any Derivative Works that You distribute must include a readable
117
+ copy of the attribution notices contained within such NOTICE file,
118
+ excluding those notices that do not pertain to any part of the
119
+ Derivative Works, in at least one of the following places: within a
120
+ NOTICE text file distributed as part of the Derivative Works; within
121
+ the Source form or documentation, if provided along with the Derivative
122
+ Works; or, within a display generated by the Derivative Works, if and
123
+ wherever such third-party notices normally appear. The contents of the
124
+ NOTICE file are for informational purposes only and do not modify the
125
+ License. You may add Your own attribution notices within Derivative
126
+ Works that You distribute, alongside or as an addendum to the NOTICE
127
+ text from the Work, provided that such additional attribution notices
128
+ cannot be construed as modifying the License.
129
+
130
+ (e) Community Contribution Requirement: If You create a Derivative Work
131
+ (specifically a Derivative Model, such as a Fine-tune or LoRA) and You
132
+ distribute it, publicly perform it, or use it to generate publicly
133
+ available Output, You must make the Source form (including the weights,
134
+ parameters, and training configuration) of said Derivative Work publicly
135
+ available under the terms of this License. This clause ensures that
136
+ improvements to the model remain accessible to the community.
137
+
138
+ --------------------------------------------------------------------------------
139
+
140
+ 5. RESTRICTIONS ON COMMERCIAL SERVICES (THE "ANTI-SAAS" CLAUSE)
141
+
142
+ Notwithstanding the grants in Section 2, You are strictly prohibited from
143
+ using the Work or Derivative Works to operate, promote, or distribute a
144
+ commercial service where the primary value provided to users is the ability
145
+ to generate Outputs using the Work. This includes, but is not limited to:
146
+
147
+ (a) Offering the Work as a hosted Application Programming Interface (API);
148
+
149
+ (b) Offering the Work as a Software-as-a-Service (SaaS) product;
150
+
151
+ (c) Integrating the Work into a commercial platform that charges users
152
+ specifically for generation capabilities.
153
+
154
+ Note: You may use the Work to develop independent software tools, plugins,
155
+ or applications (e.g., local plugins for DAWs), provided that such tools
156
+ run locally on end-users' hardware and do not violate the restrictions on
157
+ hosted commercial generation services defined above.
158
+
159
+ --------------------------------------------------------------------------------
160
+
161
+ 6. OWNERSHIP AND COMMERCIALIZATION OF OUTPUTS
162
+
163
+ (a) Personal and Non-Commercial Use: You are free to use Outputs generated
164
+ by the Work for personal use, research, educational purposes, and
165
+ non-commercial creative projects (e.g., background music for personal
166
+ videos) without restriction.
167
+
168
+ (b) Commercial Use and Verification: The Licensor provides the Work without
169
+ any default warranty of copyright ownership for raw Outputs. To use
170
+ Outputs for Commercial Purposes (including but not limited to
171
+ distributing to music streaming platforms such as Spotify, Apple Music,
172
+ or YouTube Music, or registering Content ID), You must obtain Proof of
173
+ Human Creation or authorization through ACE Studio (or channels
174
+ officially designated by the Licensor). "Commercial Purposes" implies
175
+ intent to profit from the direct exploitation of the generated audio.
176
+
177
+ (c) Prohibition on Mass Generation: You are expressly prohibited from using
178
+ the Work to generate and distribute mass quantities of content
179
+ ("spamming") for the purpose of flooding streaming services,
180
+ manipulating royalty systems, or engaging in automated content farming.
181
+
182
+ --------------------------------------------------------------------------------
183
+
184
+ 7. SUBMISSION OF CONTRIBUTIONS
185
+
186
+ Unless You explicitly state otherwise, any Contribution intentionally
187
+ submitted for inclusion in the Work by You to the Licensor shall be under
188
+ the terms and conditions of this License, without any additional terms or
189
+ conditions. Notwithstanding the above, nothing herein shall supersede or
190
+ modify the terms of any separate license agreement you may have executed
191
+ with Licensor regarding such Contributions.
192
+
193
+ --------------------------------------------------------------------------------
194
+
195
+ 8. TRADEMARKS
196
+
197
+ This License does not grant permission to use the trade names, trademarks,
198
+ service marks, or product names of the Licensor, except as required for
199
+ reasonable and customary use in describing the origin of the Work and
200
+ reproducing the content of the NOTICE file.
201
+
202
+ --------------------------------------------------------------------------------
203
+
204
+ 9. DISCLAIMER OF WARRANTY
205
+
206
+ Unless required by applicable law or agreed to in writing, Licensor provides
207
+ the Work (and each Contributor provides its Contributions) on an "AS IS"
208
+ BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
209
+ implied, including, without limitation, any warranties or conditions of
210
+ TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR
211
+ PURPOSE. You are solely responsible for determining the appropriateness of
212
+ using or redistributing the Work and assume any risks associated with Your
213
+ exercise of permissions under this License.
214
+
215
+ --------------------------------------------------------------------------------
216
+
217
+ 10. LIMITATION OF LIABILITY
218
+
219
+ In no event and under no legal theory, whether in tort (including
220
+ negligence), contract, or otherwise, unless required by applicable law
221
+ (such as deliberate and grossly negligent acts) or agreed to in writing,
222
+ shall any Contributor be liable to You for damages, including any direct,
223
+ indirect, special, incidental, or consequential damages of any character
224
+ arising as a result of this License or out of the use or inability to use
225
+ the Work (including but not limited to damages for loss of goodwill, work
226
+ stoppage, computer failure or malfunction, or any and all other commercial
227
+ damages or losses), even if such Contributor has been advised of the
228
+ possibility of such damages.
229
+
230
+ --------------------------------------------------------------------------------
231
+
232
+ 11. ACCEPTING WARRANTY OR ADDITIONAL LIABILITY
233
+
234
+ While redistributing the Work or Derivative Works thereof, You may choose
235
+ to offer, and charge a fee for, acceptance of support, warranty,
236
+ indemnity, or other liability obligations and/or rights consistent with
237
+ this License. However, in accepting such obligations, You may act only on
238
+ Your own behalf and on Your sole responsibility, not on behalf of any
239
+ other Contributor, and only if You agree to indemnify, defend, and hold
240
+ each Contributor harmless for any liability incurred by, or claims asserted
241
+ against, such Contributor by reason of your accepting any such warranty or
242
+ additional liability.
243
+
244
+ ================================================================================
245
+ END OF TERMS AND CONDITIONS
246
+ ================================================================================
acestep/acestep_v15_pipeline.py ADDED
@@ -0,0 +1,67 @@
1
+ """
2
+ ACE-Step V1.5 Pipeline
3
+ Handler wrapper connecting model and UI
4
+ """
5
+ import os
6
+ import sys
7
+
8
+ # Clear proxy settings that may affect Gradio
9
+ for proxy_var in ['http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY', 'ALL_PROXY']:
10
+ os.environ.pop(proxy_var, None)
11
+
12
+ os.environ['CUDA_VISIBLE_DEVICES'] = '7' # Adjust as needed
13
+
14
+ from .handler import AceStepHandler
15
+ from .gradio_ui import create_gradio_interface
16
+
17
+
18
+ def create_demo():
19
+ """
20
+ Create Gradio demo interface
21
+
22
+ Returns:
23
+ Gradio Blocks instance
24
+ """
25
+ # Create handler instance (business logic processor)
26
+ handler = AceStepHandler()
27
+
28
+ # Create Gradio interface
29
+ demo = create_gradio_interface(handler)
30
+
31
+ return demo
32
+
33
+
34
+ def main():
35
+ """Main entry function"""
36
+ import argparse
37
+
38
+ parser = argparse.ArgumentParser(description="Gradio Demo for ACE-Step V1.5")
39
+ parser.add_argument("--port", type=int, default=7860, help="Port to run the gradio server on")
40
+ parser.add_argument("--share", action="store_true", help="Create a public link")
41
+ parser.add_argument("--debug", action="store_true", help="Enable debug mode")
42
+ parser.add_argument("--server-name", type=str, default="127.0.0.1", help="Server name (default: 127.0.0.1, use 0.0.0.0 for all interfaces)")
43
+ args = parser.parse_args()
44
+
45
+ try:
46
+ # Create and launch demo
47
+ print("Creating Gradio interface...")
48
+ demo = create_demo()
49
+ print(f"Launching server on {args.server_name}:{args.port}...")
50
+ demo.launch(
51
+ server_name=args.server_name,
52
+ server_port=args.port,
53
+ share=args.share,
54
+ debug=args.debug,
55
+ show_error=True,
56
+ prevent_thread_lock=False, # Keep thread locked to maintain server running
57
+ inbrowser=False, # Don't auto-open browser
58
+ )
59
+ except Exception as e:
60
+ print(f"Error launching Gradio: {e}", file=sys.stderr)
61
+ import traceback
62
+ traceback.print_exc()
63
+ sys.exit(1)
64
+
65
+
66
+ if __name__ == "__main__":
67
+ main()
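
For context on how this entry point is meant to be used: a minimal local launch, sketched under the assumption that the `acestep` package is importable and checkpoints are available (illustrative only, not part of the commit):

```python
# Minimal sketch: launch the demo programmatically instead of via main().
# Assumes `acestep` is on PYTHONPATH; host/port values here are illustrative.
from acestep.acestep_v15_pipeline import create_demo

demo = create_demo()  # builds AceStepHandler + Gradio Blocks
demo.launch(server_name="127.0.0.1", server_port=7860, share=False)
```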
acestep/gradio_ui.py ADDED
@@ -0,0 +1,744 @@
1
+ """
2
+ Gradio UI Components Module
3
+ Contains all Gradio interface component definitions and layouts
4
+ """
5
+ import gradio as gr
6
+ from typing import Callable, Optional
7
+
8
+
9
+ def create_gradio_interface(handler) -> gr.Blocks:
10
+ """
11
+ Create Gradio interface
12
+
13
+ Args:
14
+ handler: Business logic handler instance
15
+
16
+ Returns:
17
+ Gradio Blocks instance
18
+ """
19
+ with gr.Blocks(
20
+ title="ACE-Step V1.5 Demo",
21
+ theme=gr.themes.Soft(),
22
+ css="""
23
+ .main-header {
24
+ text-align: center;
25
+ margin-bottom: 2rem;
26
+ }
27
+ .section-header {
28
+ background: linear-gradient(90deg, #4CAF50, #45a049);
29
+ color: white;
30
+ padding: 10px;
31
+ border-radius: 5px;
32
+ margin: 10px 0;
33
+ }
34
+ """
35
+ ) as demo:
36
+
37
+ gr.HTML("""
38
+ <div class="main-header">
39
+ <h1>♪ ACE-Step V1.5 Demo</h1>
40
+ <p>Generate music from text captions and lyrics using diffusion models</p>
41
+ </div>
42
+ """)
43
+
44
+ # Dataset Explorer Section
45
+ dataset_section = create_dataset_section(handler)
46
+
47
+ # Generation Section
48
+ generation_section = create_generation_section(handler)
49
+
50
+ # Results Section
51
+ results_section = create_results_section(handler)
52
+
53
+ # Connect event handlers
54
+ setup_event_handlers(demo, handler, dataset_section, generation_section, results_section)
55
+
56
+ return demo
57
+
58
+
59
+ def create_dataset_section(handler) -> dict:
60
+ """Create dataset explorer section"""
61
+ with gr.Group():
62
+ gr.HTML('<div class="section-header"><h3>📊 Dataset Explorer</h3></div>')
63
+
64
+ with gr.Row(equal_height=True):
65
+ dataset_type = gr.Dropdown(
66
+ choices=["train", "test"],
67
+ value="train",
68
+ label="Dataset",
69
+ info="Choose dataset to explore",
70
+ scale=2
71
+ )
72
+ import_dataset_btn = gr.Button("📥 Import Dataset", variant="primary", scale=1)
73
+
74
+ search_type = gr.Dropdown(
75
+ choices=["keys", "idx", "random"],
76
+ value="random",
77
+ label="Search Type",
78
+ info="How to find items",
79
+ scale=1
80
+ )
81
+ search_value = gr.Textbox(
82
+ label="Search Value",
83
+ placeholder="Enter keys or index (leave empty for random)",
84
+ info="Keys: exact match, Index: 0 to dataset size-1",
85
+ scale=2
86
+ )
87
+
88
+ instruction_display = gr.Textbox(
89
+ label="📝 Instruction",
90
+ interactive=False,
91
+ placeholder="No instruction available",
92
+ lines=1
93
+ )
94
+
95
+ repaint_viz_plot = gr.Plot()
96
+
97
+ with gr.Accordion("📋 Item Metadata (JSON)", open=False):
98
+ item_info_json = gr.Code(
99
+ label="Complete Item Information",
100
+ language="json",
101
+ interactive=False,
102
+ lines=15
103
+ )
104
+
105
+ with gr.Row(equal_height=True):
106
+ item_src_audio = gr.Audio(
107
+ label="Source Audio",
108
+ type="filepath",
109
+ interactive=False,
110
+ scale=8
111
+ )
112
+ get_item_btn = gr.Button("🔍 Get Item", variant="secondary", interactive=False, scale=2)
113
+
114
+ with gr.Row(equal_height=True):
115
+ item_target_audio = gr.Audio(
116
+ label="Target Audio",
117
+ type="filepath",
118
+ interactive=False,
119
+ scale=8
120
+ )
121
+ item_refer_audio = gr.Audio(
122
+ label="Reference Audio",
123
+ type="filepath",
124
+ interactive=False,
125
+ scale=2
126
+ )
127
+
128
+ with gr.Row():
129
+ use_src_checkbox = gr.Checkbox(
130
+ label="Use Source Audio from Dataset",
131
+ value=True,
132
+ info="Check to use the source audio from dataset"
133
+ )
134
+
135
+ data_status = gr.Textbox(label="📊 Data Status", interactive=False, value="❌ No dataset imported")
136
+ auto_fill_btn = gr.Button("📋 Auto-fill Generation Form", variant="primary")
137
+
138
+ return {
139
+ "dataset_type": dataset_type,
140
+ "import_dataset_btn": import_dataset_btn,
141
+ "search_type": search_type,
142
+ "search_value": search_value,
143
+ "instruction_display": instruction_display,
144
+ "repaint_viz_plot": repaint_viz_plot,
145
+ "item_info_json": item_info_json,
146
+ "item_src_audio": item_src_audio,
147
+ "get_item_btn": get_item_btn,
148
+ "item_target_audio": item_target_audio,
149
+ "item_refer_audio": item_refer_audio,
150
+ "use_src_checkbox": use_src_checkbox,
151
+ "data_status": data_status,
152
+ "auto_fill_btn": auto_fill_btn,
153
+ }
154
+
155
+
156
+ def create_generation_section(handler) -> dict:
157
+ """Create generation section"""
158
+ with gr.Group():
159
+ gr.HTML('<div class="section-header"><h3>🎼 ACE-Step V1.5 Demo </h3></div>')
160
+
161
+ # Service Configuration
162
+ with gr.Accordion("🔧 Service Configuration", open=True) as service_config_accordion:
163
+ with gr.Row():
164
+ with gr.Column(scale=2):
165
+ checkpoint_dropdown = gr.Dropdown(
166
+ label="Checkpoint File",
167
+ choices=handler.get_available_checkpoints(),
168
+ value=None,
169
+ info="Select a trained model checkpoint file (full path or filename)"
170
+ )
171
+ with gr.Column(scale=1):
172
+ refresh_btn = gr.Button("🔄 Refresh", size="sm")
173
+
174
+ with gr.Row():
175
+ # Get available acestep-v15- model list
176
+ available_models = handler.get_available_acestep_v15_models()
177
+ default_model = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else (available_models[0] if available_models else None)
178
+
179
+ config_path = gr.Dropdown(
180
+ label="Main Model Path",
181
+ choices=available_models,
182
+ value=default_model,
183
+ info="Select the model configuration directory (auto-scanned from checkpoints)"
184
+ )
185
+ device = gr.Dropdown(
186
+ choices=["auto", "cuda", "cpu"],
187
+ value="auto",
188
+ label="Device",
189
+ info="Processing device (auto-detect recommended)"
190
+ )
191
+
192
+ with gr.Row():
193
+ # Get available 5Hz LM model list
194
+ available_lm_models = handler.get_available_5hz_lm_models()
195
+ default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None)
196
+
197
+ lm_model_path = gr.Dropdown(
198
+ label="5Hz LM Model Path",
199
+ choices=available_lm_models,
200
+ value=default_lm_model,
201
+ info="Select the 5Hz LM model checkpoint (auto-scanned from checkpoints)"
202
+ )
203
+ init_llm_checkbox = gr.Checkbox(
204
+ label="Initialize 5Hz LM",
205
+ value=False,
206
+ info="Check to initialize 5Hz LM during service initialization"
207
+ )
208
+
209
+ with gr.Row():
210
+ # Auto-detect flash attention availability
211
+ flash_attn_available = handler.is_flash_attention_available()
212
+ use_flash_attention_checkbox = gr.Checkbox(
213
+ label="Use Flash Attention",
214
+ value=flash_attn_available,
215
+ interactive=flash_attn_available,
216
+ info="Enable flash attention for faster inference (requires flash_attn package)" if flash_attn_available else "Flash attention not available (flash_attn package not installed)"
217
+ )
218
+
219
+ init_btn = gr.Button("Initialize Service", variant="primary", size="lg")
220
+ init_status = gr.Textbox(label="Status", interactive=False, lines=3)
221
+
222
+ # Inputs
223
+ with gr.Row():
224
+ with gr.Column(scale=2):
225
+ with gr.Accordion("📝 Required Inputs", open=True):
226
+ # Task type
227
+ with gr.Row():
228
+ with gr.Column(scale=2):
229
+ task_type = gr.Dropdown(
230
+ choices=["text2music", "repaint", "cover", "extract", "lego", "complete"],
231
+ value="text2music",
232
+ label="Task Type",
233
+ info="Select the task type for generation",
234
+ )
235
+ with gr.Column(scale=8):
236
+ instruction_display_gen = gr.Textbox(
237
+ label="Instruction",
238
+ value="Fill the audio semantic mask based on the given conditions:",
239
+ interactive=False,
240
+ lines=1,
241
+ info="Instruction is automatically generated based on task type",
242
+ )
243
+
244
+ track_name = gr.Dropdown(
245
+ choices=["woodwinds", "brass", "fx", "synth", "strings", "percussion",
246
+ "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"],
247
+ value=None,
248
+ label="Track Name",
249
+ info="Select track name for lego/extract tasks",
250
+ visible=False
251
+ )
252
+
253
+ complete_track_classes = gr.CheckboxGroup(
254
+ choices=["woodwinds", "brass", "fx", "synth", "strings", "percussion",
255
+ "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"],
256
+ label="Track Names",
257
+ info="Select multiple track classes for complete task",
258
+ visible=False
259
+ )
260
+
261
+ # Audio uploads
262
+ with gr.Accordion("🎵 Audio Uploads", open=False):
263
+ with gr.Row():
264
+ with gr.Column(scale=2):
265
+ reference_audio = gr.Audio(
266
+ label="Reference Audio (optional)",
267
+ type="filepath",
268
+ )
269
+ with gr.Column(scale=8):
270
+ src_audio = gr.Audio(
271
+ label="Source Audio (optional)",
272
+ type="filepath",
273
+ )
274
+
275
+ audio_code_string = gr.Textbox(
276
+ label="Audio Codes (optional)",
277
+ placeholder="<|audio_code_10695|><|audio_code_54246|>...",
278
+ lines=4,
279
+ visible=False,
280
+ info="Paste precomputed audio code tokens"
281
+ )
282
+
283
+ # Audio Codes for text2music
284
+ with gr.Accordion("🎼 Audio Codes (for text2music)", open=True, visible=True) as text2music_audio_codes_group:
285
+ text2music_audio_code_string = gr.Textbox(
286
+ label="Audio Codes",
287
+ placeholder="<|audio_code_10695|><|audio_code_54246|>...",
288
+ lines=6,
289
+ info="Paste precomputed audio code tokens for text2music generation"
290
+ )
291
+
292
+ # 5Hz LM
293
+ with gr.Row(visible=False) as use_5hz_lm_row:
294
+ use_5hz_lm_btn = gr.Button(
295
+ "Generate LM Hints",
296
+ variant="secondary",
297
+ size="lg",
298
+ )
299
+ lm_temperature = gr.Slider(
300
+ label="Temperature",
301
+ minimum=0.0,
302
+ maximum=2.0,
303
+ value=0.7,
304
+ step=0.1,
305
+ scale=2,
306
+ info="Temperature for 5Hz LM sampling"
307
+ )
308
+
309
+ # Repainting controls
310
+ with gr.Group(visible=False) as repainting_group:
311
+ gr.HTML("<h5>🎨 Repainting Controls (seconds) </h5>")
312
+ with gr.Row():
313
+ repainting_start = gr.Number(
314
+ label="Repainting Start",
315
+ value=0.0,
316
+ step=0.1,
317
+ )
318
+ repainting_end = gr.Number(
319
+ label="Repainting End",
320
+ value=-1,
321
+ minimum=-1,
322
+ step=0.1,
323
+ )
324
+
325
+ # Audio Cover Strength
326
+ audio_cover_strength = gr.Slider(
327
+ minimum=0.0,
328
+ maximum=1.0,
329
+ value=1.0,
330
+ step=0.01,
331
+ label="Audio Cover Strength",
332
+ info="Control how many denoising steps use cover mode",
333
+ visible=False
334
+ )
335
+
336
+ # Music Caption
337
+ with gr.Accordion("📝 Music Caption", open=True):
338
+ captions = gr.Textbox(
339
+ label="Music Caption (optional)",
340
+ placeholder="A peaceful acoustic guitar melody with soft vocals...",
341
+ lines=3,
342
+ info="Describe the style, genre, instruments, and mood"
343
+ )
344
+
345
+ # Lyrics
346
+ with gr.Accordion("📝 Lyrics", open=True):
347
+ lyrics = gr.Textbox(
348
+ label="Lyrics (optional)",
349
+ placeholder="[Verse 1]\nUnder the starry night\nI feel so alive...",
350
+ lines=8,
351
+ info="Song lyrics with structure"
352
+ )
353
+
354
+ # Optional Parameters
355
+ with gr.Accordion("⚙️ Optional Parameters", open=True):
356
+ with gr.Row():
357
+ vocal_language = gr.Dropdown(
358
+ choices=["en", "zh", "ja", "ko", "es", "fr", "de"],
359
+ value="en",
360
+ label="Vocal Language (optional)",
361
+ allow_custom_value=True
362
+ )
363
+ bpm = gr.Number(
364
+ label="BPM (optional)",
365
+ value=None,
366
+ step=1,
367
+ info="leave empty for N/A"
368
+ )
369
+ key_scale = gr.Textbox(
370
+ label="Key/Scale (optional)",
371
+ placeholder="Leave empty for N/A",
372
+ value="",
373
+ )
374
+ time_signature = gr.Dropdown(
375
+ choices=["2", "3", "4", "N/A", ""],
376
+ value="4",
377
+ label="Time Signature (optional)",
378
+ allow_custom_value=True
379
+ )
380
+ audio_duration = gr.Number(
381
+ label="Audio Duration (seconds)",
382
+ value=-1,
383
+ minimum=-1,
384
+ maximum=600.0,
385
+ step=0.1,
386
+ info="Use -1 for random"
387
+ )
388
+ batch_size_input = gr.Number(
389
+ label="Batch Size",
390
+ value=1,
391
+ minimum=1,
392
+ maximum=8,
393
+ step=1,
394
+ info="Number of audio files to parallel generate"
395
+ )
396
+
397
+ # Advanced Settings
398
+ with gr.Accordion("🔧 Advanced Settings", open=False):
399
+ with gr.Row():
400
+ inference_steps = gr.Slider(
401
+ minimum=1,
402
+ maximum=8,
403
+ value=8,
404
+ step=1,
405
+ label="Inference Steps",
406
+ info="Turbo: max 8, Base: max 100"
407
+ )
408
+ guidance_scale = gr.Slider(
409
+ minimum=1.0,
410
+ maximum=15.0,
411
+ value=7.0,
412
+ step=0.1,
413
+ label="Guidance Scale",
414
+ info="Higher values follow text more closely",
415
+ visible=False
416
+ )
417
+ seed = gr.Textbox(
418
+ label="Seed",
419
+ value="-1",
420
+ info="Use comma-separated values for batches"
421
+ )
422
+ random_seed_checkbox = gr.Checkbox(
423
+ label="Random Seed",
424
+ value=True,
425
+ info="Enable to auto-generate seeds"
426
+ )
427
+
428
+ with gr.Row():
429
+ use_adg = gr.Checkbox(
430
+ label="Use ADG",
431
+ value=False,
432
+ info="Enable Angle Domain Guidance",
433
+ visible=False
434
+ )
435
+
436
+ with gr.Row():
437
+ cfg_interval_start = gr.Slider(
438
+ minimum=0.0,
439
+ maximum=1.0,
440
+ value=0.0,
441
+ step=0.01,
442
+ label="CFG Interval Start",
443
+ visible=False
444
+ )
445
+ cfg_interval_end = gr.Slider(
446
+ minimum=0.0,
447
+ maximum=1.0,
448
+ value=1.0,
449
+ step=0.01,
450
+ label="CFG Interval End",
451
+ visible=False
452
+ )
453
+
454
+ with gr.Row():
455
+ audio_format = gr.Dropdown(
456
+ choices=["mp3", "flac"],
457
+ value="mp3",
458
+ label="Audio Format",
459
+ info="Audio format for saved files"
460
+ )
461
+
462
+ generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg", interactive=False)
463
+
464
+ return {
465
+ "checkpoint_dropdown": checkpoint_dropdown,
466
+ "refresh_btn": refresh_btn,
467
+ "config_path": config_path,
468
+ "device": device,
469
+ "init_btn": init_btn,
470
+ "init_status": init_status,
471
+ "lm_model_path": lm_model_path,
472
+ "init_llm_checkbox": init_llm_checkbox,
473
+ "use_flash_attention_checkbox": use_flash_attention_checkbox,
474
+ "task_type": task_type,
475
+ "instruction_display_gen": instruction_display_gen,
476
+ "track_name": track_name,
477
+ "complete_track_classes": complete_track_classes,
478
+ "reference_audio": reference_audio,
479
+ "src_audio": src_audio,
480
+ "audio_code_string": audio_code_string,
481
+ "text2music_audio_code_string": text2music_audio_code_string,
482
+ "text2music_audio_codes_group": text2music_audio_codes_group,
483
+ "use_5hz_lm_row": use_5hz_lm_row,
484
+ "use_5hz_lm_btn": use_5hz_lm_btn,
485
+ "lm_temperature": lm_temperature,
486
+ "repainting_group": repainting_group,
487
+ "repainting_start": repainting_start,
488
+ "repainting_end": repainting_end,
489
+ "audio_cover_strength": audio_cover_strength,
490
+ "captions": captions,
491
+ "lyrics": lyrics,
492
+ "vocal_language": vocal_language,
493
+ "bpm": bpm,
494
+ "key_scale": key_scale,
495
+ "time_signature": time_signature,
496
+ "audio_duration": audio_duration,
497
+ "batch_size_input": batch_size_input,
498
+ "inference_steps": inference_steps,
499
+ "guidance_scale": guidance_scale,
500
+ "seed": seed,
501
+ "random_seed_checkbox": random_seed_checkbox,
502
+ "use_adg": use_adg,
503
+ "cfg_interval_start": cfg_interval_start,
504
+ "cfg_interval_end": cfg_interval_end,
505
+ "audio_format": audio_format,
506
+ "generate_btn": generate_btn,
507
+ }
508
+
509
+
510
+ def create_results_section(handler) -> dict:
511
+ """Create results display section"""
512
+ with gr.Group():
513
+ gr.HTML('<div class="section-header"><h3>🎧 Generated Results</h3></div>')
514
+
515
+ status_output = gr.Textbox(label="Generation Status", interactive=False)
516
+
517
+ with gr.Row():
518
+ with gr.Column():
519
+ generated_audio_1 = gr.Audio(
520
+ label="🎵 Generated Music (Sample 1)",
521
+ type="filepath",
522
+ interactive=False
523
+ )
524
+ with gr.Column():
525
+ generated_audio_2 = gr.Audio(
526
+ label="🎵 Generated Music (Sample 2)",
527
+ type="filepath",
528
+ interactive=False
529
+ )
530
+
531
+ with gr.Accordion("📁 Batch Results & Generation Details", open=False):
532
+ generated_audio_batch = gr.File(
533
+ label="📁 All Generated Files (Download)",
534
+ file_count="multiple",
535
+ interactive=False
536
+ )
537
+ generation_info = gr.Markdown(label="Generation Details")
538
+
539
+ gr.Markdown("### ⚖️ Alignment Preference Analysis")
540
+
541
+ with gr.Row():
542
+ with gr.Column():
543
+ align_score_1 = gr.Textbox(label="Alignment Score (Sample 1)", interactive=False)
544
+ align_text_1 = gr.Textbox(label="Lyric Timestamps (Sample 1)", interactive=False, lines=10)
545
+ align_plot_1 = gr.Plot(label="Alignment Heatmap (Sample 1)")
546
+ with gr.Column():
547
+ align_score_2 = gr.Textbox(label="Alignment Score (Sample 2)", interactive=False)
548
+ align_text_2 = gr.Textbox(label="Lyric Timestamps (Sample 2)", interactive=False, lines=10)
549
+ align_plot_2 = gr.Plot(label="Alignment Heatmap (Sample 2)")
550
+
551
+ return {
552
+ "status_output": status_output,
553
+ "generated_audio_1": generated_audio_1,
554
+ "generated_audio_2": generated_audio_2,
555
+ "generated_audio_batch": generated_audio_batch,
556
+ "generation_info": generation_info,
557
+ "align_score_1": align_score_1,
558
+ "align_text_1": align_text_1,
559
+ "align_plot_1": align_plot_1,
560
+ "align_score_2": align_score_2,
561
+ "align_text_2": align_text_2,
562
+ "align_plot_2": align_plot_2,
563
+ }
564
+
565
+
566
+ def setup_event_handlers(demo, handler, dataset_section, generation_section, results_section):
567
+ """Setup event handlers connecting UI components and business logic"""
568
+
569
+ def update_init_status(status_msg, enable_btn):
570
+ """Update initialization status and enable/disable generate button"""
571
+ return status_msg, gr.update(interactive=enable_btn)
572
+
573
+ # Dataset handlers
574
+ dataset_section["import_dataset_btn"].click(
575
+ fn=handler.import_dataset,
576
+ inputs=[dataset_section["dataset_type"]],
577
+ outputs=[dataset_section["data_status"]]
578
+ )
579
+
580
+ # Service initialization - refresh checkpoints
581
+ def refresh_checkpoints():
582
+ choices = handler.get_available_checkpoints()
583
+ return gr.update(choices=choices)
584
+
585
+ generation_section["refresh_btn"].click(
586
+ fn=refresh_checkpoints,
587
+ outputs=[generation_section["checkpoint_dropdown"]]
588
+ )
589
+
590
+ # Update UI based on model type (turbo vs base)
591
+ def update_model_type_settings(config_path):
592
+ """Update UI settings based on model type"""
593
+ if config_path is None:
594
+ config_path = ""
595
+ config_path_lower = config_path.lower()
596
+
597
+ if "turbo" in config_path_lower:
598
+ # Turbo model: max 8 steps, hide CFG/ADG
599
+ return (
600
+ gr.update(value=8, maximum=8, minimum=1), # inference_steps
601
+ gr.update(visible=False), # guidance_scale
602
+ gr.update(visible=False), # use_adg
603
+ gr.update(visible=False), # cfg_interval_start
604
+ gr.update(visible=False), # cfg_interval_end
605
+ )
606
+ elif "base" in config_path_lower:
607
+ # Base model: max 100 steps, show CFG/ADG
608
+ return (
609
+ gr.update(value=32, maximum=100, minimum=1), # inference_steps
610
+ gr.update(visible=True), # guidance_scale
611
+ gr.update(visible=True), # use_adg
612
+ gr.update(visible=True), # cfg_interval_start
613
+ gr.update(visible=True), # cfg_interval_end
614
+ )
615
+ else:
616
+ # Default to turbo settings
617
+ return (
618
+ gr.update(value=8, maximum=8, minimum=1),
619
+ gr.update(visible=False),
620
+ gr.update(visible=False),
621
+ gr.update(visible=False),
622
+ gr.update(visible=False),
623
+ )
624
+
625
+ generation_section["config_path"].change(
626
+ fn=update_model_type_settings,
627
+ inputs=[generation_section["config_path"]],
628
+ outputs=[
629
+ generation_section["inference_steps"],
630
+ generation_section["guidance_scale"],
631
+ generation_section["use_adg"],
632
+ generation_section["cfg_interval_start"],
633
+ generation_section["cfg_interval_end"],
634
+ ]
635
+ )
636
+
637
+ # Service initialization
638
+ def init_service_wrapper(checkpoint, config_path, device, init_llm, lm_model_path, use_flash_attention):
639
+ """Wrapper for service initialization, returns status and button state"""
640
+ status, enable = handler.initialize_service(checkpoint, config_path, device, init_llm, lm_model_path, use_flash_attention)
641
+ return status, gr.update(interactive=enable)
642
+
643
+ generation_section["init_btn"].click(
644
+ fn=init_service_wrapper,
645
+ inputs=[
646
+ generation_section["checkpoint_dropdown"],
647
+ generation_section["config_path"],
648
+ generation_section["device"],
649
+ generation_section["init_llm_checkbox"],
650
+ generation_section["lm_model_path"],
651
+ generation_section["use_flash_attention_checkbox"],
652
+ ],
653
+ outputs=[generation_section["init_status"], generation_section["generate_btn"]]
654
+ )
655
+
656
+ # Generation with progress bar
657
+ def generate_with_progress(
658
+ captions, lyrics, bpm, key_scale, time_signature, vocal_language,
659
+ inference_steps, guidance_scale, random_seed_checkbox, seed,
660
+ reference_audio, audio_duration, batch_size_input, src_audio,
661
+ text2music_audio_code_string, repainting_start, repainting_end,
662
+ instruction_display_gen, audio_cover_strength, task_type,
663
+ use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
664
+ progress=gr.Progress(track_tqdm=True)
665
+ ):
666
+ return handler.generate_music(
667
+ captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
668
+ time_signature=time_signature, vocal_language=vocal_language,
669
+ inference_steps=inference_steps, guidance_scale=guidance_scale,
670
+ use_random_seed=random_seed_checkbox, seed=seed,
671
+ reference_audio=reference_audio, audio_duration=audio_duration,
672
+ batch_size=batch_size_input, src_audio=src_audio,
673
+ audio_code_string=text2music_audio_code_string,
674
+ repainting_start=repainting_start, repainting_end=repainting_end,
675
+ instruction=instruction_display_gen, audio_cover_strength=audio_cover_strength,
676
+ task_type=task_type, use_adg=use_adg,
677
+ cfg_interval_start=cfg_interval_start, cfg_interval_end=cfg_interval_end,
678
+ audio_format=audio_format, lm_temperature=lm_temperature,
679
+ progress=progress
680
+ )
681
+
682
+ generation_section["generate_btn"].click(
683
+ fn=generate_with_progress,
684
+ inputs=[
685
+ generation_section["captions"],
686
+ generation_section["lyrics"],
687
+ generation_section["bpm"],
688
+ generation_section["key_scale"],
689
+ generation_section["time_signature"],
690
+ generation_section["vocal_language"],
691
+ generation_section["inference_steps"],
692
+ generation_section["guidance_scale"],
693
+ generation_section["random_seed_checkbox"],
694
+ generation_section["seed"],
695
+ generation_section["reference_audio"],
696
+ generation_section["audio_duration"],
697
+ generation_section["batch_size_input"],
698
+ generation_section["src_audio"],
699
+ generation_section["text2music_audio_code_string"],
700
+ generation_section["repainting_start"],
701
+ generation_section["repainting_end"],
702
+ generation_section["instruction_display_gen"],
703
+ generation_section["audio_cover_strength"],
704
+ generation_section["task_type"],
705
+ generation_section["use_adg"],
706
+ generation_section["cfg_interval_start"],
707
+ generation_section["cfg_interval_end"],
708
+ generation_section["audio_format"],
709
+ generation_section["lm_temperature"]
710
+ ],
711
+ outputs=[
712
+ results_section["generated_audio_1"],
713
+ results_section["generated_audio_2"],
714
+ results_section["generated_audio_batch"],
715
+ results_section["generation_info"],
716
+ results_section["status_output"],
717
+ generation_section["seed"],
718
+ results_section["align_score_1"],
719
+ results_section["align_text_1"],
720
+ results_section["align_plot_1"],
721
+ results_section["align_score_2"],
722
+ results_section["align_text_2"],
723
+ results_section["align_plot_2"]
724
+ ]
725
+ )
726
+
727
+ # 5Hz LM generation (simplified version, can be extended as needed)
728
+ def generate_lm_hints_wrapper(caption, lyrics, temperature):
729
+ """Wrapper for 5Hz LM generation"""
730
+ metadata, audio_codes, status = handler.generate_with_5hz_lm(caption, lyrics, temperature)
731
+ # 返回格式化的结果,可以根据需要调整
732
+ result_text = f"Status: {status}\n\nMetadata: {metadata}\n\nAudio Codes: {audio_codes[:200]}..." if len(audio_codes) > 200 else f"Status: {status}\n\nMetadata: {metadata}\n\nAudio Codes: {audio_codes}"
733
+ return result_text
734
+
735
+ generation_section["use_5hz_lm_btn"].click(
736
+ fn=generate_lm_hints_wrapper,
737
+ inputs=[
738
+ generation_section["captions"],
739
+ generation_section["lyrics"],
740
+ generation_section["lm_temperature"]
741
+ ],
742
+ outputs=[generation_section["text2music_audio_code_string"]]
743
+ )
744
+
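
Before the full handler below, it may help to see the surface that `create_gradio_interface()` and `setup_event_handlers()` actually rely on. The stub below is a hypothetical stand-in (not part of the commit) that mirrors the method names and return shapes called from the UI code above, so the layout can be smoke-tested without loading any models:

```python
# Hypothetical stand-in for AceStepHandler (illustration only): it mirrors the
# methods and return shapes that create_gradio_interface()/setup_event_handlers()
# call, so the UI can be exercised without model weights.
class StubHandler:
    def get_available_checkpoints(self):
        return []  # dropdown choices

    def get_available_acestep_v15_models(self):
        return ["acestep-v15-turbo"]

    def get_available_5hz_lm_models(self):
        return ["acestep-5Hz-lm-0.6B"]

    def is_flash_attention_available(self):
        return False

    def import_dataset(self, dataset_type):
        return f"Imported {dataset_type} (stub)"  # -> data_status textbox

    def initialize_service(self, checkpoint, config_path, device,
                           init_llm, lm_model_path, use_flash_attention):
        return "Service initialized (stub)", True  # status text, enable generate button

    def generate_with_5hz_lm(self, caption, lyrics, temperature):
        return "{}", "", "ok"  # metadata, audio_code_string, status

    def generate_music(self, **kwargs):
        # Must match the 12 outputs wired to the generate button.
        return (None, None, [], "stub details", "done", kwargs.get("seed", "-1"),
                "", "", None, "", "", None)
```

A quick smoke test would then be `create_gradio_interface(StubHandler()).launch()`.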
acestep/handler.py ADDED
@@ -0,0 +1,1100 @@
1
+ """
2
+ Business Logic Handler
3
+ Encapsulates all data processing and business logic as a bridge between model and UI
4
+ """
5
+ import os
6
+ import math
7
+ import glob
8
+ import tempfile
9
+ import traceback
10
+ import re
11
+ import random
12
+ from typing import Optional, Dict, Any, Tuple, List, Union
13
+
14
+ import torch
15
+ import matplotlib.pyplot as plt
16
+ import numpy as np
17
+ import scipy.io.wavfile as wavfile
18
+ import soundfile as sf
19
+ import time
20
+
21
+ from transformers import AutoTokenizer, AutoModel
22
+ from diffusers.models import AutoencoderOobleck
23
+
24
+
25
+ class AceStepHandler:
26
+ """ACE-Step Business Logic Handler"""
27
+
28
+ def __init__(self):
29
+ self.model = None
30
+ self.config = None
31
+ self.device = "cpu"
32
+ self.dtype = torch.float32 # Will be set based on device in initialize_service
33
+ self.temp_dir = tempfile.mkdtemp()
34
+
35
+ # VAE for audio encoding/decoding
36
+ self.vae = None
37
+
38
+ # Text encoder and tokenizer
39
+ self.text_encoder = None
40
+ self.text_tokenizer = None
41
+
42
+ # Silence latent for initialization
43
+ self.silence_latent = None
44
+
45
+ # Sample rate
46
+ self.sample_rate = 48000
47
+
48
+ # 5Hz LM related
49
+ self.lm_model = None
50
+ self.lm_tokenizer = None
51
+ self.lm_initialized = False
52
+
53
+ # Reward model (temporarily disabled)
54
+ self.reward_model = None
55
+
56
+ # Dataset related (temporarily disabled)
57
+ self.dataset = None
58
+ self.dataset_imported = False
59
+
60
+ # Batch size
61
+ self.batch_size = 2
62
+
63
+ # Custom layers config
64
+ self.custom_layers_config = {
65
+ 2: [6, 7],
66
+ 3: [10, 11],
67
+ 4: [3],
68
+ 5: [8, 9, 11],
69
+ 6: [8]
70
+ }
71
+
72
+ def get_available_checkpoints(self) -> List[str]:
73
+ """Return available checkpoint directory paths"""
74
+ # Get project root (handler.py is in acestep/, so go up two levels to project root)
75
+ current_file = os.path.abspath(__file__)
76
+ project_root = os.path.dirname(os.path.dirname(current_file))
77
+ # default checkpoints
78
+ checkpoint_dir = os.path.join(project_root, "checkpoints")
79
+ if os.path.exists(checkpoint_dir):
80
+ return [checkpoint_dir]
81
+ else:
82
+ return []
83
+
84
+ def get_available_acestep_v15_models(self) -> List[str]:
85
+ """Scan and return all model directory names starting with 'acestep-v15-'"""
86
+ # Get project root
87
+ current_file = os.path.abspath(__file__)
88
+ project_root = os.path.dirname(os.path.dirname(current_file))
89
+ checkpoint_dir = os.path.join(project_root, "checkpoints")
90
+
91
+ models = []
92
+ if os.path.exists(checkpoint_dir):
93
+ # Scan all directories starting with 'acestep-v15-' in checkpoints folder
94
+ for item in os.listdir(checkpoint_dir):
95
+ item_path = os.path.join(checkpoint_dir, item)
96
+ if os.path.isdir(item_path) and item.startswith("acestep-v15-"):
97
+ models.append(item)
98
+
99
+ # Sort by name
100
+ models.sort()
101
+ return models
102
+
103
+ def get_available_5hz_lm_models(self) -> List[str]:
104
+ """Scan and return all model directory names starting with 'acestep-5Hz-lm-'"""
105
+ current_file = os.path.abspath(__file__)
106
+ project_root = os.path.dirname(os.path.dirname(current_file))
107
+ checkpoint_dir = os.path.join(project_root, "checkpoints")
108
+
109
+ models = []
110
+ if os.path.exists(checkpoint_dir):
111
+ for item in os.listdir(checkpoint_dir):
112
+ item_path = os.path.join(checkpoint_dir, item)
113
+ if os.path.isdir(item_path) and item.startswith("acestep-5Hz-lm-"):
114
+ models.append(item)
115
+
116
+ models.sort()
117
+ return models
118
+
119
+ def is_flash_attention_available(self) -> bool:
120
+ """Check if flash attention is available on the system"""
121
+ try:
122
+ import flash_attn
123
+ return True
124
+ except ImportError:
125
+ return False
126
+
127
+ def initialize_service(
128
+ self,
129
+ project_root: str,
130
+ config_path: str,
131
+ device: str = "auto",
132
+ init_llm: bool = False,
133
+ lm_model_path: str = "acestep-5Hz-lm-0.6B",
134
+ use_flash_attention: bool = False,
135
+ ) -> Tuple[str, bool]:
136
+ """
137
+ Initialize model service
138
+
139
+ Args:
140
+ project_root: Project root path (may be checkpoints directory, will be handled automatically)
141
+ config_path: Model config directory name (e.g., "acestep-v15-turbo")
142
+ device: Device type
143
+ init_llm: Whether to initialize 5Hz LM model
144
+ lm_model_path: 5Hz LM model path
145
+ use_flash_attention: Whether to use flash attention (requires flash_attn package)
146
+
147
+ Returns:
148
+ (status_message, enable_generate_button)
149
+ """
150
+ try:
151
+ if device == "auto":
152
+ device = "cuda" if torch.cuda.is_available() else "cpu"
153
+
154
+ self.device = device
155
+ # Set dtype based on device: bfloat16 for cuda, float32 for cpu
156
+ self.dtype = torch.bfloat16 if device == "cuda" else torch.float32
157
+
158
+ # Auto-detect project root (independent of passed project_root parameter)
159
+ current_file = os.path.abspath(__file__)
160
+ actual_project_root = os.path.dirname(os.path.dirname(current_file))
161
+ checkpoint_dir = os.path.join(actual_project_root, "checkpoints")
162
+
163
+ # 1. Load main model
164
+ # config_path is relative path (e.g., "acestep-v15-turbo"), concatenate to checkpoints directory
165
+ acestep_v15_checkpoint_path = os.path.join(checkpoint_dir, config_path)
166
+ if os.path.exists(acestep_v15_checkpoint_path):
167
+ # Determine attention implementation
168
+ attn_implementation = "flash_attention_2" if use_flash_attention and self.is_flash_attention_available() else "eager"
169
+ self.model = AutoModel.from_pretrained(
170
+ acestep_v15_checkpoint_path,
171
+ trust_remote_code=True,
172
+ attn_implementation=attn_implementation
173
+ )
174
+ self.config = self.model.config
175
+ # Move model to device and set dtype
176
+ self.model = self.model.to(device).to(self.dtype)
177
+ self.model.eval()
178
+ silence_latent_path = os.path.join(acestep_v15_checkpoint_path, "silence_latent.pt")
179
+ if os.path.exists(silence_latent_path):
180
+ self.silence_latent = torch.load(silence_latent_path).transpose(1, 2).squeeze(0) # [L, C]
181
+ self.silence_latent = self.silence_latent.to(device).to(self.dtype)
182
+ else:
183
+ raise FileNotFoundError(f"Silence latent not found at {silence_latent_path}")
184
+ else:
185
+ raise FileNotFoundError(f"ACE-Step V1.5 checkpoint not found at {acestep_v15_checkpoint_path}")
186
+
187
+ # 2. Load VAE
188
+ vae_checkpoint_path = os.path.join(checkpoint_dir, "vae")
189
+ if os.path.exists(vae_checkpoint_path):
190
+ self.vae = AutoencoderOobleck.from_pretrained(vae_checkpoint_path)
191
+ self.vae = self.vae.to(device).to(self.dtype)
192
+ self.vae.eval()
193
+ else:
194
+ raise FileNotFoundError(f"VAE checkpoint not found at {vae_checkpoint_path}")
195
+
196
+ # 3. Load text encoder and tokenizer
197
+ text_encoder_path = os.path.join(checkpoint_dir, "Qwen3-Embedding-0.6B")
198
+ if os.path.exists(text_encoder_path):
199
+ self.text_tokenizer = AutoTokenizer.from_pretrained(text_encoder_path)
200
+ self.text_encoder = AutoModel.from_pretrained(text_encoder_path)
201
+ self.text_encoder = self.text_encoder.to(device).to(self.dtype)
202
+ self.text_encoder.eval()
203
+ else:
204
+ raise FileNotFoundError(f"Text encoder not found at {text_encoder_path}")
205
+
206
+ # 4. Load 5Hz LM model (optional, only if init_llm is True)
207
+ if init_llm:
208
+ full_lm_model_path = os.path.join(checkpoint_dir, lm_model_path)
209
+ if os.path.exists(full_lm_model_path):
210
+ if device == "cuda":
211
+ status_msg = self._initialize_5hz_lm_cuda(full_lm_model_path)
212
+ if not self.llm_initialized:
213
+ return status_msg, False
214
+ else:
+ # Fallback for non-CUDA devices: load the LM with transformers (no nanovllm acceleration)
+ self.llm = AutoModel.from_pretrained(full_lm_model_path)
216
+ self.llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path)
216
+ else:
217
+ # 5Hz LM path not found
218
+ return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
219
+
220
+ # Determine actual attention implementation used
221
+ actual_attn = "flash_attention_2" if use_flash_attention and self.is_flash_attention_available() else "eager"
222
+
223
+ status_msg = f"✅ Model initialized successfully on {device}\n"
224
+ status_msg += f"Main model: {acestep_v15_checkpoint_path}\n"
225
+ status_msg += f"VAE: {vae_checkpoint_path}\n"
226
+ status_msg += f"Text encoder: {text_encoder_path}\n"
227
+ if init_llm and hasattr(self, 'llm') and self.llm is not None:
228
+ status_msg += f"5Hz LM model: {os.path.join(checkpoint_dir, lm_model_path)}\n"
229
+ else:
230
+ status_msg += f"5Hz LM model: Not loaded (checkbox not selected)\n"
231
+ status_msg += f"Dtype: {self.dtype}\n"
232
+ status_msg += f"Attention: {actual_attn}"
233
+
234
+ return status_msg, True
235
+
236
+ except Exception as e:
237
+ error_msg = f"❌ Error initializing model: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
238
+ return error_msg, False
239
+
240
+ def import_dataset(self, dataset_type: str) -> str:
241
+ """Import dataset (temporarily disabled)"""
242
+ self.dataset_imported = False
243
+ return f"⚠️ Dataset import is currently disabled. Text2MusicDataset dependency not available."
244
+
245
+ def get_item_data(self, *args, **kwargs):
246
+ """Get dataset item (temporarily disabled)"""
247
+ return "", "", "", "", "", None, None, None, "❌ Dataset not available", "", 0, "", None, None, None, {}, "text2music"
248
+
249
+ def get_gpu_memory_utilization(self, minimal_gpu: float = 8, min_ratio: float = 0.2, max_ratio: float = 0.9) -> float:
250
+ """Get GPU memory utilization ratio"""
251
+ try:
252
+ device = torch.device("cuda:0")
253
+ total_gpu_mem_bytes = torch.cuda.get_device_properties(device).total_memory
254
+ allocated_mem_bytes = torch.cuda.memory_allocated(device)
255
+ reserved_mem_bytes = torch.cuda.memory_reserved(device)
256
+
257
+ total_gpu = total_gpu_mem_bytes / 1024**3
258
+ allocated_gpu = allocated_mem_bytes / 1024**3
259
+ reserved_gpu = reserved_mem_bytes / 1024**3
260
+ available_gpu = total_gpu - reserved_gpu
261
+
262
+ if available_gpu >= minimal_gpu:
263
+ ratio = min(max_ratio, max(min_ratio, minimal_gpu / total_gpu))
264
+ else:
265
+ ratio = min(max_ratio, max(min_ratio, (available_gpu * 0.8) / total_gpu))
266
+
267
+ return ratio
268
+ except Exception as e:
269
+ return 0.9
270
+
271
+ def _initialize_5hz_lm_cuda(self, model_path: str) -> str:
272
+ """Initialize 5Hz LM model"""
273
+ try:
274
+ from nanovllm import LLM, SamplingParams
275
+
276
+ if not torch.cuda.is_available():
277
+ return "❌ CUDA is not available. Please check your GPU setup."
278
+
279
+ current_device = torch.cuda.current_device()
280
+ device_name = torch.cuda.get_device_name(current_device)
281
+
282
+ torch.cuda.empty_cache()
283
+ gpu_memory_utilization = self.get_gpu_memory_utilization(
284
+ minimal_gpu=8,
285
+ min_ratio=0.2,
286
+ max_ratio=0.9
287
+ )
288
+
289
+ self.llm = LLM(
290
+ model=model_path,
291
+ enforce_eager=False,
292
+ tensor_parallel_size=1,
293
+ max_model_len=4096,
294
+ gpu_memory_utilization=gpu_memory_utilization,
295
+ )
296
+ self.llm_tokenizer = self.llm.tokenizer
297
+ self.llm_initialized = True
298
+ return f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nDevice: {device_name}\nGPU Memory Utilization: {gpu_memory_utilization:.2f}"
299
+ except Exception as e:
300
+ self.llm_initialized = False
301
+ error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
302
+ return error_msg
303
+
304
+ def generate_with_5hz_lm(self, caption: str, lyrics: str, temperature: float = 0.6) -> Tuple[Dict[str, Any], str, str]:
305
+ """Generate metadata and audio codes using 5Hz LM"""
306
+ if not self.llm_initialized or self.llm is None:
307
+ return {}, "", "❌ 5Hz LM not initialized. Please initialize it first."
308
+
309
+ try:
310
+ from nanovllm import SamplingParams
311
+
312
+ prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
313
+
314
+ formatted_prompt = self.llm_tokenizer.apply_chat_template(
315
+ [
316
+ {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
317
+ {"role": "user", "content": prompt}
318
+ ],
319
+ tokenize=False,
320
+ add_generation_prompt=True,
321
+ )
322
+
323
+ sampling_params = SamplingParams(max_tokens=3072, temperature=temperature)
324
+ outputs = self.llm.generate([formatted_prompt], sampling_params)
325
+
326
+ if isinstance(outputs, list) and len(outputs) > 0:
327
+ if hasattr(outputs[0], 'outputs') and len(outputs[0].outputs) > 0:
328
+ output_text = outputs[0].outputs[0].text
329
+ elif hasattr(outputs[0], 'text'):
330
+ output_text = outputs[0].text
331
+ else:
332
+ output_text = str(outputs[0])
333
+ else:
334
+ output_text = str(outputs)
335
+
336
+ metadata, audio_codes = self.parse_lm_output(output_text)
337
+ codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
338
+ return metadata, audio_codes, f"✅ Generated successfully\nOutput length: {len(output_text)} chars\nCodes count: {codes_count}"
339
+
340
+ except Exception as e:
341
+ error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
342
+ return {}, "", error_msg
343
+
344
+ def parse_lm_output(self, output_text: str) -> Tuple[Dict[str, Any], str]:
345
+ """Parse LM output"""
346
+ metadata = {}
347
+ audio_codes = ""
348
+
349
+ import re
350
+
351
+ # Extract audio codes
352
+ code_pattern = r'<\|audio_code_\d+\|>'
353
+ code_matches = re.findall(code_pattern, output_text)
354
+ if code_matches:
355
+ audio_codes = "".join(code_matches)
356
+
357
+ # Extract metadata
358
+ reasoning_patterns = [
359
+ r'<think>(.*?)</think>',
360
+ r'<reasoning>(.*?)</reasoning>',
361
+ ]
362
+
363
+ reasoning_text = None
364
+ for pattern in reasoning_patterns:
365
+ match = re.search(pattern, output_text, re.DOTALL)
366
+ if match:
367
+ reasoning_text = match.group(1).strip()
368
+ break
369
+
370
+ if not reasoning_text:
371
+ lines_before_codes = output_text.split('<|audio_code_')[0] if '<|audio_code_' in output_text else output_text
372
+ reasoning_text = lines_before_codes.strip()
373
+
374
+ # Parse metadata fields
375
+ if reasoning_text:
376
+ for line in reasoning_text.split('\n'):
377
+ line = line.strip()
378
+ if ':' in line and not line.startswith('<'):
379
+ parts = line.split(':', 1)
380
+ if len(parts) == 2:
381
+ key = parts[0].strip().lower()
382
+ value = parts[1].strip()
383
+
384
+ if key == 'bpm':
385
+ try:
386
+ metadata['bpm'] = int(value)
387
+ except:
388
+ metadata['bpm'] = value
389
+ elif key == 'duration':
390
+ try:
391
+ metadata['duration'] = int(value)
392
+ except:
393
+ metadata['duration'] = value
394
+ elif key in ['genres', 'keyscale', 'timesignature']:
395
+ metadata[key] = value
396
+
397
+ return metadata, audio_codes
398
+
399
+ def process_reference_audio(self, audio_file) -> Optional[torch.Tensor]:
400
+ """Process reference audio"""
401
+ if audio_file is None:
402
+ return None
403
+
404
+ try:
405
+ # Load audio using soundfile
406
+ audio_np, sr = sf.read(audio_file, dtype='float32')
407
+ # Convert to torch: [samples, channels] or [samples] -> [channels, samples]
408
+ if audio_np.ndim == 1:
409
+ audio = torch.from_numpy(audio_np).unsqueeze(0)
410
+ else:
411
+ audio = torch.from_numpy(audio_np.T)
412
+
413
+ if audio.shape[0] == 1:
414
+ audio = torch.cat([audio, audio], dim=0)
415
+
416
+ audio = audio[:2]
417
+
418
+ # Resample if needed
419
+ if sr != 48000:
420
+ import torch.nn.functional as F
421
+ # Simple resampling using interpolate
422
+ ratio = 48000 / sr
423
+ new_length = int(audio.shape[-1] * ratio)
424
+ audio = F.interpolate(audio.unsqueeze(0), size=new_length, mode='linear', align_corners=False).squeeze(0)
425
+
426
+ audio = torch.clamp(audio, -1.0, 1.0)
427
+
428
+ target_frames = 30 * 48000
429
+ if audio.shape[-1] > target_frames:
430
+ start_frame = (audio.shape[-1] - target_frames) // 2
431
+ audio = audio[:, start_frame:start_frame + target_frames]
432
+ elif audio.shape[-1] < target_frames:
433
+ audio = torch.nn.functional.pad(
434
+ audio, (0, target_frames - audio.shape[-1]), 'constant', 0
435
+ )
436
+
437
+ return audio
438
+ except Exception as e:
439
+ print(f"Error processing reference audio: {e}")
440
+ return None
441
+
442
+ def process_target_audio(self, audio_file) -> Optional[torch.Tensor]:
443
+ """Process target audio"""
444
+ if audio_file is None:
445
+ return None
446
+
447
+ try:
448
+ # Load audio using soundfile
449
+ audio_np, sr = sf.read(audio_file, dtype='float32')
450
+ # Convert to torch: [samples, channels] or [samples] -> [channels, samples]
451
+ if audio_np.ndim == 1:
452
+ audio = torch.from_numpy(audio_np).unsqueeze(0)
453
+ else:
454
+ audio = torch.from_numpy(audio_np.T)
455
+
456
+ if audio.shape[0] == 1:
457
+ audio = torch.cat([audio, audio], dim=0)
458
+
459
+ audio = audio[:2]
460
+
461
+ # Resample if needed
462
+ if sr != 48000:
463
+ import torch.nn.functional as F
464
+ ratio = 48000 / sr
465
+ new_length = int(audio.shape[-1] * ratio)
466
+ audio = F.interpolate(audio.unsqueeze(0), size=new_length, mode='linear', align_corners=False).squeeze(0)
467
+
468
+ audio = torch.clamp(audio, -1.0, 1.0)
469
+
470
+ return audio
471
+ except Exception as e:
472
+ print(f"Error processing target audio: {e}")
473
+ return None
474
+
475
+ def _parse_audio_code_string(self, code_str: str) -> List[int]:
476
+ """Extract integer audio codes from prompt tokens like <|audio_code_123|>."""
477
+ if not code_str:
478
+ return []
479
+ try:
480
+ return [int(x) for x in re.findall(r"<\|audio_code_(\d+)\|>", code_str)]
481
+ except Exception:
482
+ return []
483
+
484
+ def _decode_audio_codes_to_latents(self, code_str: str) -> Optional[torch.Tensor]:
485
+ """
486
+ Convert serialized audio code string into 25Hz latents using model quantizer/detokenizer.
487
+ """
488
+ if not self.model or not hasattr(self.model, 'tokenizer') or not hasattr(self.model, 'detokenizer'):
489
+ return None
490
+
491
+ code_ids = self._parse_audio_code_string(code_str)
492
+ if len(code_ids) == 0:
493
+ return None
494
+
495
+ quantizer = self.model.tokenizer.quantizer
496
+ detokenizer = self.model.detokenizer
497
+
498
+ num_quantizers = getattr(quantizer, "num_quantizers", 1)
499
+ indices = torch.tensor(code_ids, device=self.device, dtype=torch.long).unsqueeze(0) # [1, T_5Hz]
500
+
501
+ # Expand to include quantizer dimension: [1, T_5Hz, num_quantizers]
502
+ if indices.dim() == 2:
503
+ indices = indices.unsqueeze(-1).expand(-1, -1, num_quantizers)
504
+
505
+ # Get quantized representation from indices: [1, T_5Hz, dim]
506
+ quantized = quantizer.get_output_from_indices(indices)
507
+ if quantized.dtype != self.dtype:
508
+ quantized = quantized.to(self.dtype)
509
+
510
+ # Detokenize to 25Hz: [1, T_5Hz, dim] -> [1, T_25Hz, dim]
511
+ lm_hints_25hz = detokenizer(quantized)
512
+ return lm_hints_25hz
513
+
514
+ def _create_default_meta(self) -> str:
515
+ """Create default metadata string."""
516
+ return (
517
+ "- bpm: N/A\n"
518
+ "- timesignature: N/A\n"
519
+ "- keyscale: N/A\n"
520
+ "- duration: 30 seconds\n"
521
+ )
522
+
523
+ def _dict_to_meta_string(self, meta_dict: Dict[str, Any]) -> str:
524
+ """Convert metadata dict to formatted string."""
525
+ bpm = meta_dict.get('bpm', meta_dict.get('tempo', 'N/A'))
526
+ timesignature = meta_dict.get('timesignature', meta_dict.get('time_signature', 'N/A'))
527
+ keyscale = meta_dict.get('keyscale', meta_dict.get('key', meta_dict.get('scale', 'N/A')))
528
+ duration = meta_dict.get('duration', meta_dict.get('length', 30))
529
+
530
+ # Format duration
531
+ if isinstance(duration, (int, float)):
532
+ duration = f"{int(duration)} seconds"
533
+ elif not isinstance(duration, str):
534
+ duration = "30 seconds"
535
+
536
+ return (
537
+ f"- bpm: {bpm}\n"
538
+ f"- timesignature: {timesignature}\n"
539
+ f"- keyscale: {keyscale}\n"
540
+ f"- duration: {duration}\n"
541
+ )
542
+
543
+ def _parse_metas(self, metas: List[Union[str, Dict[str, Any]]]) -> List[str]:
544
+ """Parse and normalize metadata with fallbacks."""
545
+ parsed_metas = []
546
+ for meta in metas:
547
+ if meta is None:
548
+ parsed_meta = self._create_default_meta()
549
+ elif isinstance(meta, str):
550
+ parsed_meta = meta
551
+ elif isinstance(meta, dict):
552
+ parsed_meta = self._dict_to_meta_string(meta)
553
+ else:
554
+ parsed_meta = self._create_default_meta()
555
+ parsed_metas.append(parsed_meta)
556
+ return parsed_metas
557
+
558
+ def _get_text_hidden_states(self, text_prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
559
+ """Get text hidden states from text encoder."""
560
+ if self.text_tokenizer is None or self.text_encoder is None:
561
+ raise ValueError("Text encoder not initialized")
562
+
563
+ # Tokenize
564
+ text_inputs = self.text_tokenizer(
565
+ text_prompt,
566
+ padding="longest",
567
+ truncation=True,
568
+ max_length=256,
569
+ return_tensors="pt",
570
+ )
571
+ text_input_ids = text_inputs.input_ids.to(self.device)
572
+ text_attention_mask = text_inputs.attention_mask.to(self.device).bool()
573
+
574
+ # Encode
575
+ with torch.no_grad():
576
+ text_outputs = self.text_encoder(text_input_ids)
577
+ if hasattr(text_outputs, 'last_hidden_state'):
578
+ text_hidden_states = text_outputs.last_hidden_state
579
+ elif isinstance(text_outputs, tuple):
580
+ text_hidden_states = text_outputs[0]
581
+ else:
582
+ text_hidden_states = text_outputs
583
+
584
+ text_hidden_states = text_hidden_states.to(self.dtype)
585
+
586
+ return text_hidden_states, text_attention_mask
587
+
588
+ def extract_caption_from_sft_format(self, caption: str) -> str:
589
+ """Extract caption from SFT format if needed."""
590
+ # Simple extraction - can be enhanced if needed
591
+ if caption and isinstance(caption, str):
592
+ return caption.strip()
593
+ return caption if caption else ""
594
+
595
+ def generate_music(
596
+ self,
597
+ captions: str,
598
+ lyrics: str,
599
+ bpm: Optional[int] = None,
600
+ key_scale: str = "",
601
+ time_signature: str = "",
602
+ vocal_language: str = "en",
603
+ inference_steps: int = 8,
604
+ guidance_scale: float = 7.0,
605
+ use_random_seed: bool = True,
606
+ seed: Optional[Union[str, float, int]] = -1,
607
+ reference_audio=None,
608
+ audio_duration: Optional[float] = None,
609
+ batch_size: Optional[int] = None,
610
+ src_audio=None,
611
+ audio_code_string: str = "",
612
+ repainting_start: float = 0.0,
613
+ repainting_end: Optional[float] = None,
614
+ instruction: str = "Fill the audio semantic mask based on the given conditions:",
615
+ audio_cover_strength: float = 1.0,
616
+ task_type: str = "text2music",
617
+ use_adg: bool = False,
618
+ cfg_interval_start: float = 0.0,
619
+ cfg_interval_end: float = 1.0,
620
+ audio_format: str = "mp3",
621
+ lm_temperature: float = 0.6,
622
+ progress=None
623
+ ) -> Tuple[Optional[str], Optional[str], List[str], str, str, str, str, str, Optional[Any], str, str, Optional[Any]]:
624
+ """
625
+ Main interface for music generation
626
+
627
+ Returns:
628
+ (first_audio, second_audio, all_audio_paths, generation_info, status_message,
629
+ seed_value_for_ui, align_score_1, align_text_1, align_plot_1,
630
+ align_score_2, align_text_2, align_plot_2)
631
+ """
632
+ if self.model is None or self.vae is None or self.text_tokenizer is None or self.text_encoder is None:
633
+ return None, None, [], "", "❌ Model not fully initialized. Please initialize all components first.", "-1", "", "", None, "", "", None
634
+
635
+ try:
636
+ print("[generate_music] Starting generation...")
637
+ if progress:
638
+ progress(0.05, desc="Preparing inputs...")
639
+ print("[generate_music] Preparing inputs...")
640
+
641
+ # Determine actual batch size
642
+ actual_batch_size = batch_size if batch_size is not None else self.batch_size
643
+ actual_batch_size = max(1, min(actual_batch_size, 8)) # Limit to 8 for memory safety
644
+
645
+ # Process seeds
646
+ if use_random_seed:
647
+ seed_list = [random.randint(0, 2**32 - 1) for _ in range(actual_batch_size)]
648
+ else:
649
+ # Parse seed input
650
+ if isinstance(seed, str):
651
+ seed_parts = [s.strip() for s in seed.split(",")]
652
+ seed_list = [int(float(s)) if s != "-1" and s else random.randint(0, 2**32 - 1) for s in seed_parts[:actual_batch_size]]
653
+ elif isinstance(seed, (int, float)) and seed >= 0:
654
+ seed_list = [int(seed)] * actual_batch_size
655
+ else:
656
+ seed_list = [random.randint(0, 2**32 - 1) for _ in range(actual_batch_size)]
657
+
658
+ # Pad if needed
659
+ while len(seed_list) < actual_batch_size:
660
+ seed_list.append(random.randint(0, 2**32 - 1))
661
+
662
+ seed_value_for_ui = ", ".join(str(s) for s in seed_list)
663
+
664
+ # Process audio inputs
665
+ processed_ref_audio = self.process_reference_audio(reference_audio) if reference_audio else None
666
+ processed_src_audio = self.process_target_audio(src_audio) if src_audio else None
667
+
668
+ # Extract caption
669
+ pure_caption = self.extract_caption_from_sft_format(captions)
670
+
671
+ # Determine task type and update instruction if needed
672
+ if task_type == "text2music" and audio_code_string and str(audio_code_string).strip():
673
+ task_type = "cover"
674
+ instruction = "Generate audio semantic tokens based on the given conditions:"
675
+
676
+ # Build metadata
677
+ metadata_dict = {
678
+ "bpm": bpm if bpm else "N/A",
679
+ "keyscale": key_scale if key_scale else "N/A",
680
+ "timesignature": time_signature if time_signature else "N/A",
681
+ }
682
+
683
+ # Calculate duration
684
+ if processed_src_audio is not None:
685
+ calculated_duration = processed_src_audio.shape[-1] / self.sample_rate
686
+ elif audio_duration is not None and audio_duration > 0:
687
+ calculated_duration = audio_duration
688
+ else:
689
+ calculated_duration = 30.0 # Default 30 seconds
690
+
691
+ metadata_dict["duration"] = f"{int(calculated_duration)} seconds"
692
+
693
+ if progress:
694
+ progress(0.1, desc="Processing audio inputs...")
695
+ print("[generate_music] Processing audio inputs...")
696
+
697
+ # Prepare batch data
698
+ captions_batch = [pure_caption] * actual_batch_size
699
+ lyrics_batch = [lyrics] * actual_batch_size
700
+ vocal_languages_batch = [vocal_language] * actual_batch_size
701
+ instructions_batch = [instruction] * actual_batch_size
702
+ metas_batch = [metadata_dict.copy()] * actual_batch_size
703
+ audio_code_hints_batch = [audio_code_string if audio_code_string else None] * actual_batch_size
704
+
705
+ # Process reference audios
706
+ if processed_ref_audio is not None:
707
+ refer_audios = [[processed_ref_audio] for _ in range(actual_batch_size)]
708
+ else:
709
+ # Create silence as fallback
710
+ silence_frames = 30 * self.sample_rate
711
+ silence = torch.zeros(2, silence_frames)
712
+ refer_audios = [[silence] for _ in range(actual_batch_size)]
713
+
714
+ # Process target wavs (src_audio)
715
+ if processed_src_audio is not None:
716
+ target_wavs_list = [processed_src_audio.clone() for _ in range(actual_batch_size)]
717
+ else:
718
+ # Create silence based on duration
719
+ target_frames = int(calculated_duration * self.sample_rate)
720
+ silence = torch.zeros(2, target_frames)
721
+ target_wavs_list = [silence for _ in range(actual_batch_size)]
722
+
723
+ # Pad target_wavs to consistent length
724
+ max_target_frames = max(wav.shape[-1] for wav in target_wavs_list)
725
+ target_wavs = torch.stack([
726
+ torch.nn.functional.pad(wav, (0, max_target_frames - wav.shape[-1]), 'constant', 0)
727
+ for wav in target_wavs_list
728
+ ])
729
+
730
+ if progress:
731
+ progress(0.2, desc="Encoding audio to latents...")
732
+ print("[generate_music] Encoding audio to latents...")
733
+
734
+ # Encode target_wavs to latents using VAE
735
+ target_latents_list = []
736
+ latent_lengths = []
737
+
738
+ with torch.no_grad():
739
+ for i in range(actual_batch_size):
740
+ # Check if audio codes are provided
741
+ code_hint = audio_code_hints_batch[i]
742
+ if code_hint:
743
+ decoded_latents = self._decode_audio_codes_to_latents(code_hint)
744
+ if decoded_latents is not None:
745
+ decoded_latents = decoded_latents.squeeze(0) # Remove batch dim
746
+ target_latents_list.append(decoded_latents)
747
+ latent_lengths.append(decoded_latents.shape[0])
748
+ continue
749
+
750
+ # If no src_audio provided, use silence_latent directly (skip VAE)
751
+ if processed_src_audio is None:
752
+ # Calculate required latent length based on duration
753
+ # VAE downsample ratio is 1920 (2*4*4*6*10), so latent rate is 48000/1920 = 25Hz
754
+ latent_length = int(calculated_duration * 25) # 25Hz latent rate
755
+ latent_length = max(128, latent_length) # Minimum 128
756
+
757
+ # Tile silence_latent to required length
758
+ if self.silence_latent.shape[0] >= latent_length:
759
+ target_latent = self.silence_latent[:latent_length].to(self.device).to(self.dtype)
760
+ else:
761
+ repeat_times = (latent_length // self.silence_latent.shape[0]) + 1
762
+ target_latent = self.silence_latent.repeat(repeat_times, 1)[:latent_length].to(self.device).to(self.dtype)
763
+ target_latents_list.append(target_latent)
764
+ latent_lengths.append(target_latent.shape[0])
765
+ continue
766
+
767
+ # Encode from audio using VAE
768
+ current_wav = target_wavs[i].unsqueeze(0).to(self.device).to(self.dtype)
769
+ target_latent = self.vae.encode(current_wav).latent_dist.sample() # [1, latent_dim, T]
770
+ target_latent = target_latent.squeeze(0).transpose(0, 1) # [latent_length, latent_dim]
771
+ target_latents_list.append(target_latent)
772
+ latent_lengths.append(target_latent.shape[0])
773
+
774
+ # Pad latents to same length
775
+ max_latent_length = max(latent_lengths)
776
+ max_latent_length = max(128, max_latent_length) # Minimum 128
777
+
778
+ padded_latents = []
779
+ for i, latent in enumerate(target_latents_list):
780
+ if latent.shape[0] < max_latent_length:
781
+ pad_length = max_latent_length - latent.shape[0]
782
+ # Tile silence_latent to pad_length (silence_latent is [L, C])
783
+ if self.silence_latent.shape[0] >= pad_length:
784
+ pad_latent = self.silence_latent[:pad_length]
785
+ else:
786
+ repeat_times = (pad_length // self.silence_latent.shape[0]) + 1
787
+ pad_latent = self.silence_latent.repeat(repeat_times, 1)[:pad_length]
788
+ latent = torch.cat([latent, pad_latent.to(self.device).to(self.dtype)], dim=0)
789
+ padded_latents.append(latent)
790
+
791
+ target_latents = torch.stack(padded_latents).to(self.device).to(self.dtype)
792
+ latent_masks = torch.stack([
793
+ torch.cat([
794
+ torch.ones(l, dtype=torch.long, device=self.device),
795
+ torch.zeros(max_latent_length - l, dtype=torch.long, device=self.device)
796
+ ])
797
+ for l in latent_lengths
798
+ ])
799
+
800
+ if progress:
801
+ progress(0.3, desc="Preparing conditions...")
802
+ print("[generate_music] Preparing conditions...")
803
+
804
+ # Determine task type and create chunk masks
805
+ is_covers = []
806
+ chunk_masks = []
807
+ repainting_ranges = {}
808
+
809
+ for i in range(actual_batch_size):
810
+ has_code_hint = audio_code_hints_batch[i] is not None
811
+ has_repainting = (repainting_end is not None and repainting_end > repainting_start)
812
+
813
+ if has_repainting:
814
+ # Repainting mode
815
+ start_sec = max(0, repainting_start)
816
+ end_sec = repainting_end if repainting_end is not None else calculated_duration
817
+
818
+ start_latent = int(start_sec * self.sample_rate // 1920)
819
+ end_latent = int(end_sec * self.sample_rate // 1920)
820
+ start_latent = max(0, min(start_latent, max_latent_length - 1))
821
+ end_latent = max(start_latent + 1, min(end_latent, max_latent_length))
822
+
823
+ mask = torch.zeros(max_latent_length, dtype=torch.bool, device=self.device)
824
+ mask[start_latent:end_latent] = True
825
+ chunk_masks.append(mask)
826
+ repainting_ranges[i] = (start_latent, end_latent)
827
+ is_covers.append(False)
828
+ else:
829
+ # Full generation or cover
830
+ chunk_masks.append(torch.ones(max_latent_length, dtype=torch.bool, device=self.device))
831
+ # Check if cover task
832
+ instruction_lower = instructions_batch[i].lower()
833
+ is_cover = ("generate audio semantic tokens" in instruction_lower and
834
+ "based on the given conditions" in instruction_lower) or has_code_hint
835
+ is_covers.append(is_cover)
836
+
837
+ chunk_masks = torch.stack(chunk_masks).unsqueeze(-1).expand(-1, -1, 64) # [batch, length, 64]
838
+ is_covers = torch.tensor(is_covers, dtype=torch.bool, device=self.device)
839
+
840
+ # Create src_latents
841
+ # Tile silence_latent to max_latent_length (silence_latent is now [L, C])
842
+ if self.silence_latent.shape[0] >= max_latent_length:
843
+ silence_latent_tiled = self.silence_latent[:max_latent_length].to(self.device).to(self.dtype)
844
+ else:
845
+ repeat_times = (max_latent_length // self.silence_latent.shape[0]) + 1
846
+ silence_latent_tiled = self.silence_latent.repeat(repeat_times, 1)[:max_latent_length].to(self.device).to(self.dtype)
847
+ src_latents_list = []
848
+
849
+ for i in range(actual_batch_size):
850
+ has_target_audio = (target_wavs[i].abs().sum() > 1e-6) or (audio_code_hints_batch[i] is not None)
851
+
852
+ if has_target_audio:
853
+ if i in repainting_ranges:
854
+ # Repaint: replace inpainting region with silence
855
+ src_latent = target_latents[i].clone()
856
+ start_latent, end_latent = repainting_ranges[i]
857
+ src_latent[start_latent:end_latent] = silence_latent_tiled[start_latent:end_latent]
858
+ src_latents_list.append(src_latent)
859
+ else:
860
+ # Cover/extract/complete/lego: use target_latents
861
+ src_latents_list.append(target_latents[i].clone())
862
+ else:
863
+ # Text2music: use silence
864
+ src_latents_list.append(silence_latent_tiled.clone())
865
+
866
+ src_latents = torch.stack(src_latents_list) # [batch, length, channels]
867
+
868
+ if progress:
869
+ progress(0.4, desc="Tokenizing text inputs...")
870
+ print("[generate_music] Tokenizing text inputs...")
871
+
872
+ # Prepare text and lyric hidden states
873
+ SFT_GEN_PROMPT = """# Instruction
874
+ {}
875
+
876
+ # Caption
877
+ {}
878
+
879
+ # Metas
880
+ {}<|endoftext|>
881
+ """
882
+
883
+ text_hidden_states_list = []
884
+ text_attention_masks_list = []
885
+ lyric_hidden_states_list = []
886
+ lyric_attention_masks_list = []
887
+
888
+ with torch.no_grad():
889
+ for i in range(actual_batch_size):
890
+ # Format text prompt
891
+ inst = instructions_batch[i]
892
+ if not inst.endswith(":"):
893
+ inst = inst + ":"
894
+
895
+ meta_str = self._dict_to_meta_string(metas_batch[i])
896
+ text_prompt = SFT_GEN_PROMPT.format(inst, captions_batch[i], meta_str)
897
+
898
+ # Tokenize and encode text
899
+ text_hidden, text_mask = self._get_text_hidden_states(text_prompt)
900
+ text_hidden_states_list.append(text_hidden.squeeze(0))
901
+ text_attention_masks_list.append(text_mask.squeeze(0))
902
+
903
+ # Format and tokenize lyrics
904
+ lyrics_text = f"# Languages\n{vocal_languages_batch[i]}\n\n# Lyric\n{lyrics_batch[i]}<|endoftext|>"
905
+ lyric_hidden, lyric_mask = self._get_text_hidden_states(lyrics_text)
906
+ lyric_hidden_states_list.append(lyric_hidden.squeeze(0))
907
+ lyric_attention_masks_list.append(lyric_mask.squeeze(0))
908
+
909
+ # Pad sequences
910
+ max_text_length = max(h.shape[0] for h in text_hidden_states_list)
911
+ max_lyric_length = max(h.shape[0] for h in lyric_hidden_states_list)
912
+
913
+ text_hidden_states = torch.stack([
914
+ torch.nn.functional.pad(h, (0, 0, 0, max_text_length - h.shape[0]), 'constant', 0)
915
+ for h in text_hidden_states_list
916
+ ]).to(self.device).to(self.dtype)
917
+
918
+ text_attention_mask = torch.stack([
919
+ torch.nn.functional.pad(m, (0, max_text_length - m.shape[0]), 'constant', 0)
920
+ for m in text_attention_masks_list
921
+ ]).to(self.device)
922
+
923
+ lyric_hidden_states = torch.stack([
924
+ torch.nn.functional.pad(h, (0, 0, 0, max_lyric_length - h.shape[0]), 'constant', 0)
925
+ for h in lyric_hidden_states_list
926
+ ]).to(self.device).to(self.dtype)
927
+
928
+ lyric_attention_mask = torch.stack([
929
+ torch.nn.functional.pad(m, (0, max_lyric_length - m.shape[0]), 'constant', 0)
930
+ for m in lyric_attention_masks_list
931
+ ]).to(self.device)
932
+
933
+ if progress:
934
+ progress(0.5, desc="Processing reference audio...")
935
+ print("[generate_music] Processing reference audio...")
936
+
937
+ # Process reference audio for timbre
938
+ # Model expects: refer_audio_acoustic_hidden_states_packed [N, timbre_fix_frame, audio_acoustic_hidden_dim]
939
+ # refer_audio_order_mask [N] indicating batch assignment
940
+ timbre_fix_frame = getattr(self.config, 'timbre_fix_frame', 750)
941
+ refer_audio_acoustic_hidden_states_packed_list = []
942
+ refer_audio_order_mask_list = []
943
+
944
+ with torch.no_grad():
945
+ for i, ref_audio_list in enumerate(refer_audios):
946
+ if ref_audio_list and len(ref_audio_list) > 0 and ref_audio_list[0].abs().sum() > 1e-6:
947
+ # Encode reference audio: [channels, samples] -> [1, latent_dim, T] -> [T, latent_dim]
948
+ ref_audio = ref_audio_list[0].unsqueeze(0).to(self.device).to(self.dtype)
949
+ ref_latent = self.vae.encode(ref_audio).latent_dist.sample() # [1, latent_dim, T]
950
+ ref_latent = ref_latent.squeeze(0).transpose(0, 1) # [T, latent_dim]
951
+ # Ensure dimension matches audio_acoustic_hidden_dim (64)
952
+ if ref_latent.shape[-1] != self.config.audio_acoustic_hidden_dim:
953
+ ref_latent = ref_latent[:, :self.config.audio_acoustic_hidden_dim]
954
+ # Pad or truncate to timbre_fix_frame
955
+ if ref_latent.shape[0] < timbre_fix_frame:
956
+ pad_length = timbre_fix_frame - ref_latent.shape[0]
957
+ padding = torch.zeros(pad_length, ref_latent.shape[1], device=self.device, dtype=self.dtype)
958
+ ref_latent = torch.cat([ref_latent, padding], dim=0)
959
+ else:
960
+ ref_latent = ref_latent[:timbre_fix_frame]
961
+ refer_audio_acoustic_hidden_states_packed_list.append(ref_latent)
962
+ refer_audio_order_mask_list.append(i)
963
+ else:
964
+ # Use silence_latent directly instead of running VAE
965
+ if self.silence_latent.shape[0] >= timbre_fix_frame:
966
+ silence_ref = self.silence_latent[:timbre_fix_frame, :self.config.audio_acoustic_hidden_dim]
967
+ else:
968
+ repeat_times = (timbre_fix_frame // self.silence_latent.shape[0]) + 1
969
+ silence_ref = self.silence_latent.repeat(repeat_times, 1)[:timbre_fix_frame, :self.config.audio_acoustic_hidden_dim]
970
+ refer_audio_acoustic_hidden_states_packed_list.append(silence_ref.to(self.device).to(self.dtype))
971
+ refer_audio_order_mask_list.append(i)
972
+
973
+ # Stack all reference audios: [N, timbre_fix_frame, audio_acoustic_hidden_dim]
974
+ refer_audio_acoustic_hidden_states_packed = torch.stack(refer_audio_acoustic_hidden_states_packed_list, dim=0).to(self.device).to(self.dtype)
975
+ # Order mask: [N] indicating which batch item each reference belongs to
976
+ refer_audio_order_mask = torch.tensor(refer_audio_order_mask_list, dtype=torch.long, device=self.device)
977
+
978
+ if progress:
979
+ progress(0.6, desc="Generating audio...")
980
+ print("[generate_music] Calling model.generate_audio()...")
981
+ print(f" - text_hidden_states: {text_hidden_states.shape}, dtype={text_hidden_states.dtype}")
982
+ print(f" - text_attention_mask: {text_attention_mask.shape}, dtype={text_attention_mask.dtype}")
983
+ print(f" - lyric_hidden_states: {lyric_hidden_states.shape}, dtype={lyric_hidden_states.dtype}")
984
+ print(f" - lyric_attention_mask: {lyric_attention_mask.shape}, dtype={lyric_attention_mask.dtype}")
985
+ print(f" - refer_audio_acoustic_hidden_states_packed: {refer_audio_acoustic_hidden_states_packed.shape}, dtype={refer_audio_acoustic_hidden_states_packed.dtype}")
986
+ print(f" - refer_audio_order_mask: {refer_audio_order_mask.shape}, dtype={refer_audio_order_mask.dtype}")
987
+ print(f" - src_latents: {src_latents.shape}, dtype={src_latents.dtype}")
988
+ print(f" - chunk_masks: {chunk_masks.shape}, dtype={chunk_masks.dtype}")
989
+ print(f" - is_covers: {is_covers.shape}, dtype={is_covers.dtype}")
990
+ print(f" - silence_latent: {self.silence_latent.unsqueeze(0).shape}")
991
+ print(f" - seed: {seed_list[0] if len(seed_list) > 0 else None}")
992
+ print(f" - fix_nfe: {inference_steps}")
993
+
994
+ # Call model to generate
995
+ with torch.no_grad():
996
+ outputs = self.model.generate_audio(
997
+ text_hidden_states=text_hidden_states,
998
+ text_attention_mask=text_attention_mask,
999
+ lyric_hidden_states=lyric_hidden_states,
1000
+ lyric_attention_mask=lyric_attention_mask,
1001
+ refer_audio_acoustic_hidden_states_packed=refer_audio_acoustic_hidden_states_packed,
1002
+ refer_audio_order_mask=refer_audio_order_mask,
1003
+ src_latents=src_latents,
1004
+ chunk_masks=chunk_masks,
1005
+ is_covers=is_covers,
1006
+ silence_latent=self.silence_latent.unsqueeze(0), # [1, L, C]
1007
+ seed=seed_list[0] if len(seed_list) > 0 else None,
1008
+ fix_nfe=inference_steps,
1009
+ infer_method="ode",
1010
+ use_cache=True,
1011
+ )
1012
+
1013
+ print("[generate_music] Model generation completed. Decoding latents...")
1014
+ pred_latents = outputs["target_latents"] # [batch, latent_length, latent_dim]
1015
+ time_costs = outputs["time_costs"]
1016
+ print(f" - pred_latents: {pred_latents.shape}, dtype={pred_latents.dtype} {pred_latents.min()=}, {pred_latents.max()=}, {pred_latents.mean()=} {pred_latents.std()=}")
1017
+ print(f" - time_costs: {time_costs}")
1018
+ if progress:
1019
+ progress(0.8, desc="Decoding audio...")
1020
+ print("[generate_music] Decoding latents with VAE...")
1021
+
1022
+ # Decode latents to audio
1023
+ start_time = time.time()
1024
+ with torch.no_grad():
1025
+ # Transpose for VAE decode: [batch, latent_length, latent_dim] -> [batch, latent_dim, latent_length]
1026
+ pred_latents_for_decode = pred_latents.transpose(1, 2)
1027
+ pred_wavs = self.vae.decode(pred_latents_for_decode).sample # [batch, channels, samples]
1028
+ end_time = time.time()
1029
+ time_costs["vae_decode_time_cost"] = end_time - start_time
1030
+ time_costs["total_time_cost"] = time_costs["total_time_cost"] + time_costs["vae_decode_time_cost"]
1031
+
1032
+ print("[generate_music] VAE decode completed. Saving audio files...")
1033
+ if progress:
1034
+ progress(0.9, desc="Saving audio files...")
1035
+
1036
+ # Save audio files using soundfile (supports wav, flac, mp3 via format param)
1037
+ audio_format_lower = audio_format.lower() if audio_format else "wav"
1038
+ if audio_format_lower not in ["wav", "flac", "mp3"]:
1039
+ audio_format_lower = "wav"
1040
+
1041
+ saved_files = []
1042
+ for i in range(actual_batch_size):
1043
+ audio_file = os.path.join(self.temp_dir, f"generated_{i}_{seed_list[i]}.{audio_format_lower}")
1044
+ # Convert to numpy: [channels, samples] -> [samples, channels]
1045
+ audio_np = pred_wavs[i].cpu().float().numpy().T
1046
+ sf.write(audio_file, audio_np, self.sample_rate)
1047
+ saved_files.append(audio_file)
1048
+
1049
+ # Prepare return values
1050
+ first_audio = saved_files[0] if len(saved_files) > 0 else None
1051
+ second_audio = saved_files[1] if len(saved_files) > 1 else None
1052
+
1053
+ # Format time costs if available
1054
+ time_costs_str = ""
1055
+ if time_costs:
1056
+ if isinstance(time_costs, dict):
1057
+ time_costs_str = "\n\n**⏱️ Time Costs:**\n"
1058
+ for key, value in time_costs.items():
1059
+ # Format key: encoder_time_cost -> Encoder
1060
+ formatted_key = key.replace("_time_cost", "").replace("_", " ").title()
1061
+ time_costs_str += f" - {formatted_key}: {value:.2f}s\n"
1062
+ elif isinstance(time_costs, (int, float)):
1063
+ time_costs_str = f"\n\n**⏱️ Time Cost:** {time_costs:.2f}s"
1064
+
1065
+ generation_info = f"""**🎵 Generation Complete**
1066
+
1067
+ **Seeds:** {seed_value_for_ui}
1068
+ **Duration:** {calculated_duration:.1f}s
1069
+ **Steps:** {inference_steps}
1070
+ **Files:** {len(saved_files)} audio(s){time_costs_str}"""
1071
+ status_message = f"✅ Generation completed successfully!"
1072
+ print(f"[generate_music] Done! Generated {len(saved_files)} audio files.")
1073
+
1074
+ # Alignment scores and plots (placeholder for now)
1075
+ align_score_1 = ""
1076
+ align_text_1 = ""
1077
+ align_plot_1 = None
1078
+ align_score_2 = ""
1079
+ align_text_2 = ""
1080
+ align_plot_2 = None
1081
+
1082
+ return (
1083
+ first_audio,
1084
+ second_audio,
1085
+ saved_files,
1086
+ generation_info,
1087
+ status_message,
1088
+ seed_value_for_ui,
1089
+ align_score_1,
1090
+ align_text_1,
1091
+ align_plot_1,
1092
+ align_score_2,
1093
+ align_text_2,
1094
+ align_plot_2,
1095
+ )
1096
+
1097
+ except Exception as e:
1098
+ error_msg = f"❌ Error generating music: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
1099
+ return None, None, [], "", error_msg, "-1", "", "", None, "", "", None
1100
+
acestep/third_parts/nano-vllm/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Xingkai Yu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
acestep/third_parts/nano-vllm/README.md ADDED
@@ -0,0 +1,66 @@
1
+ <p align="center">
2
+ <img width="300" src="assets/logo.png">
3
+ </p>
4
+
5
+ <p align="center">
6
+ <a href="https://trendshift.io/repositories/15323" target="_blank"><img src="https://trendshift.io/api/badge/repositories/15323" alt="GeeeekExplorer%2Fnano-vllm | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
7
+ </p>
8
+
9
+ # Nano-vLLM
10
+
11
+ A lightweight vLLM implementation built from scratch.
12
+
13
+ ## Key Features
14
+
15
+ * 🚀 **Fast offline inference** - Comparable inference speeds to vLLM
16
+ * 📖 **Readable codebase** - Clean implementation in ~ 1,200 lines of Python code
17
+ * ⚡ **Optimization Suite** - Prefix caching, Tensor Parallelism, Torch compilation, CUDA graph, etc.
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ pip install git+https://github.com/GeeeekExplorer/nano-vllm.git
23
+ ```
24
+
25
+ ## Model Download
26
+
27
+ To download the model weights manually, use the following command:
28
+ ```bash
29
+ huggingface-cli download --resume-download Qwen/Qwen3-0.6B \
30
+ --local-dir ~/huggingface/Qwen3-0.6B/ \
31
+ --local-dir-use-symlinks False
32
+ ```
33
+
34
+ ## Quick Start
35
+
36
+ See `example.py` for usage. The API mirrors vLLM's interface with minor differences in the `LLM.generate` method:
37
+ ```python
38
+ from nanovllm import LLM, SamplingParams
39
+ llm = LLM("/YOUR/MODEL/PATH", enforce_eager=True, tensor_parallel_size=1)
40
+ sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
41
+ prompts = ["Hello, Nano-vLLM."]
42
+ outputs = llm.generate(prompts, sampling_params)
43
+ outputs[0]["text"]
44
+ ```
45
+
46
+ ## Benchmark
47
+
48
+ See `bench.py` for benchmark.
49
+
50
+ **Test Configuration:**
51
+ - Hardware: RTX 4070 Laptop (8GB)
52
+ - Model: Qwen3-0.6B
53
+ - Total Requests: 256 sequences
54
+ - Input Length: Randomly sampled between 100–1024 tokens
55
+ - Output Length: Randomly sampled between 100–1024 tokens
56
+
57
+ **Performance Results:**
58
+ | Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
59
+ |----------------|-------------|----------|-----------------------|
60
+ | vLLM | 133,966 | 98.37 | 1361.84 |
61
+ | Nano-vLLM | 133,966 | 93.41 | 1434.13 |
62
+
63
+
64
+ ## Star History
65
+
66
+ [![Star History Chart](https://api.star-history.com/svg?repos=GeeeekExplorer/nano-vllm&type=Date)](https://www.star-history.com/#GeeeekExplorer/nano-vllm&Date)
acestep/third_parts/nano-vllm/assets/logo.png ADDED

Git LFS Details

  • SHA256: 03ec4039dc248e97e9943694d3ccfb52c1a73a6dab94c4cd6fd4288e08de98c8
  • Pointer size: 131 Bytes
  • Size of remote file: 397 kB
acestep/third_parts/nano-vllm/bench.py ADDED
@@ -0,0 +1,32 @@
1
+ import os
2
+ import time
3
+ from random import randint, seed
4
+ from nanovllm import LLM, SamplingParams
5
+ # from vllm import LLM, SamplingParams
6
+
7
+
8
+ def main():
9
+ seed(0)
10
+ num_seqs = 256
11
+ max_input_len = 1024
12
+ max_ouput_len = 1024
13
+
14
+ path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
15
+ llm = LLM(path, enforce_eager=False, max_model_len=4096)
16
+
17
+ prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
18
+ sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_ouput_len)) for _ in range(num_seqs)]
19
+ # uncomment the following line for vllm
20
+ # prompt_token_ids = [dict(prompt_token_ids=p) for p in prompt_token_ids]
21
+
22
+ llm.generate(["Benchmark: "], SamplingParams())
23
+ t = time.time()
24
+ llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
25
+ t = (time.time() - t)
26
+ total_tokens = sum(sp.max_tokens for sp in sampling_params)
27
+ throughput = total_tokens / t
28
+ print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
29
+
30
+
31
+ if __name__ == "__main__":
32
+ main()
acestep/third_parts/nano-vllm/example.py ADDED
@@ -0,0 +1,33 @@
1
+ import os
2
+ from nanovllm import LLM, SamplingParams
3
+ from transformers import AutoTokenizer
4
+
5
+
6
+ def main():
7
+ path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
8
+ tokenizer = AutoTokenizer.from_pretrained(path)
9
+ llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
10
+
11
+ sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
12
+ prompts = [
13
+ "introduce yourself",
14
+ "list all prime numbers within 100",
15
+ ]
16
+ prompts = [
17
+ tokenizer.apply_chat_template(
18
+ [{"role": "user", "content": prompt}],
19
+ tokenize=False,
20
+ add_generation_prompt=True,
21
+ )
22
+ for prompt in prompts
23
+ ]
24
+ outputs = llm.generate(prompts, sampling_params)
25
+
26
+ for prompt, output in zip(prompts, outputs):
27
+ print("\n")
28
+ print(f"Prompt: {prompt!r}")
29
+ print(f"Completion: {output['text']!r}")
30
+
31
+
32
+ if __name__ == "__main__":
33
+ main()
acestep/third_parts/nano-vllm/nanovllm/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from nanovllm.llm import LLM
2
+ from nanovllm.sampling_params import SamplingParams
acestep/third_parts/nano-vllm/nanovllm/config.py ADDED
@@ -0,0 +1,26 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+ from transformers import AutoConfig
4
+
5
+
6
+ @dataclass
7
+ class Config:
8
+ model: str
9
+ max_num_batched_tokens: int = 16384
10
+ max_num_seqs: int = 512
11
+ max_model_len: int = 4096
12
+ gpu_memory_utilization: float = 0.9
13
+ tensor_parallel_size: int = 1
14
+ enforce_eager: bool = False
15
+ hf_config: AutoConfig | None = None
16
+ eos: int = -1
17
+ kvcache_block_size: int = 256
18
+ num_kvcache_blocks: int = -1
19
+
20
+ def __post_init__(self):
21
+ assert os.path.isdir(self.model)
22
+ assert self.kvcache_block_size % 256 == 0
23
+ assert 1 <= self.tensor_parallel_size <= 8
24
+ self.hf_config = AutoConfig.from_pretrained(self.model)
25
+ self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
26
+ assert self.max_num_batched_tokens >= self.max_model_len
acestep/third_parts/nano-vllm/nanovllm/engine/block_manager.py ADDED
@@ -0,0 +1,112 @@
1
+ from collections import deque
2
+ import xxhash
3
+ import numpy as np
4
+
5
+ from nanovllm.engine.sequence import Sequence
6
+
7
+
8
+ class Block:
9
+
10
+ def __init__(self, block_id):
11
+ self.block_id = block_id
12
+ self.ref_count = 0
13
+ self.hash = -1
14
+ self.token_ids = []
15
+
16
+ def update(self, hash: int, token_ids: list[int]):
17
+ self.hash = hash
18
+ self.token_ids = token_ids
19
+
20
+ def reset(self):
21
+ self.ref_count = 1
22
+ self.hash = -1
23
+ self.token_ids = []
24
+
25
+
26
+ class BlockManager:
27
+
28
+ def __init__(self, num_blocks: int, block_size: int):
29
+ self.block_size = block_size
30
+ self.blocks: list[Block] = [Block(i) for i in range(num_blocks)]
31
+ self.hash_to_block_id: dict[int, int] = dict()
32
+ self.free_block_ids: deque[int] = deque(range(num_blocks))
33
+ self.used_block_ids: set[int] = set()
34
+
35
+ @classmethod
36
+ def compute_hash(cls, token_ids: list[int], prefix: int = -1):
37
+ h = xxhash.xxh64()
38
+ if prefix != -1:
39
+ h.update(prefix.to_bytes(8, "little"))
40
+ h.update(np.array(token_ids).tobytes())
41
+ return h.intdigest()
42
+
43
+ def _allocate_block(self, block_id: int) -> Block:
44
+ block = self.blocks[block_id]
45
+ assert block.ref_count == 0
46
+ block.reset()
47
+ self.free_block_ids.remove(block_id)
48
+ self.used_block_ids.add(block_id)
49
+ return self.blocks[block_id]
50
+
51
+ def _deallocate_block(self, block_id: int) -> Block:
52
+ assert self.blocks[block_id].ref_count == 0
53
+ self.used_block_ids.remove(block_id)
54
+ self.free_block_ids.append(block_id)
55
+
56
+ def can_allocate(self, seq: Sequence) -> bool:
57
+ return len(self.free_block_ids) >= seq.num_blocks
58
+
59
+ def allocate(self, seq: Sequence):
60
+ assert not seq.block_table
61
+ h = -1
62
+ cache_miss = False
63
+ for i in range(seq.num_blocks):
64
+ token_ids = seq.block(i)
65
+ h = self.compute_hash(token_ids, h) if len(token_ids) == self.block_size else -1
66
+ block_id = self.hash_to_block_id.get(h, -1)
67
+ if block_id == -1 or self.blocks[block_id].token_ids != token_ids:
68
+ cache_miss = True
69
+ if cache_miss:
70
+ block_id = self.free_block_ids[0]
71
+ block = self._allocate_block(block_id)
72
+ else:
73
+ seq.num_cached_tokens += self.block_size
74
+ if block_id in self.used_block_ids:
75
+ block = self.blocks[block_id]
76
+ block.ref_count += 1
77
+ else:
78
+ block = self._allocate_block(block_id)
79
+ if h != -1:
80
+ block.update(h, token_ids)
81
+ self.hash_to_block_id[h] = block_id
82
+ seq.block_table.append(block_id)
83
+
84
+ def deallocate(self, seq: Sequence):
85
+ for block_id in reversed(seq.block_table):
86
+ block = self.blocks[block_id]
87
+ block.ref_count -= 1
88
+ if block.ref_count == 0:
89
+ self._deallocate_block(block_id)
90
+ seq.num_cached_tokens = 0
91
+ seq.block_table.clear()
92
+
93
+ def can_append(self, seq: Sequence) -> bool:
94
+ return len(self.free_block_ids) >= (len(seq) % self.block_size == 1)
95
+
96
+ def may_append(self, seq: Sequence):
97
+ block_table = seq.block_table
98
+ last_block = self.blocks[block_table[-1]]
99
+ if len(seq) % self.block_size == 1:
100
+ assert last_block.hash != -1
101
+ block_id = self.free_block_ids[0]
102
+ self._allocate_block(block_id)
103
+ block_table.append(block_id)
104
+ elif len(seq) % self.block_size == 0:
105
+ assert last_block.hash == -1
106
+ token_ids = seq.block(seq.num_blocks-1)
107
+ prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1
108
+ h = self.compute_hash(token_ids, prefix)
109
+ last_block.update(h, token_ids)
110
+ self.hash_to_block_id[h] = last_block.block_id
111
+ else:
112
+ assert last_block.hash == -1
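Note (not part of the diff): a small sketch of the prefix-hash chaining that drives the prefix cache above. A full block's hash folds in the previous block's hash, so a cached block is reused only when the entire prefix matches.

from nanovllm.engine.block_manager import BlockManager

block_a = list(range(256))                        # one full block of token ids
block_b = list(range(256, 512))
h_a = BlockManager.compute_hash(block_a)          # first block, no prefix
h_b = BlockManager.compute_hash(block_b, h_a)     # second block, chained on h_a
assert h_b != BlockManager.compute_hash(block_b)  # same tokens, different prefix -> different hash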
acestep/third_parts/nano-vllm/nanovllm/engine/llm_engine.py ADDED
@@ -0,0 +1,120 @@
1
+ import atexit
2
+ from dataclasses import fields
3
+ from time import perf_counter
4
+ from tqdm.auto import tqdm
5
+ from transformers import AutoTokenizer
6
+ import torch.multiprocessing as mp
7
+
8
+ from nanovllm.config import Config
9
+ from nanovllm.sampling_params import SamplingParams
10
+ from nanovllm.engine.sequence import Sequence
11
+ from nanovllm.engine.scheduler import Scheduler
12
+ from nanovllm.engine.model_runner import ModelRunner
13
+
14
+
15
+ class LLMEngine:
16
+
17
+ def __init__(self, model, **kwargs):
18
+ config_fields = {field.name for field in fields(Config)}
19
+ config_kwargs = {k: v for k, v in kwargs.items() if k in config_fields}
20
+ config = Config(model, **config_kwargs)
21
+ self.ps = []
22
+ self.events = []
23
+ ctx = mp.get_context("spawn")
24
+ for i in range(1, config.tensor_parallel_size):
25
+ event = ctx.Event()
26
+ process = ctx.Process(target=ModelRunner, args=(config, i, event))
27
+ process.start()
28
+ self.ps.append(process)
29
+ self.events.append(event)
30
+ self.model_runner = ModelRunner(config, 0, self.events)
31
+ self.tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
32
+ config.eos = self.tokenizer.eos_token_id
33
+ self.scheduler = Scheduler(config)
34
+ atexit.register(self.exit)
35
+
36
+ def exit(self):
37
+ self.model_runner.call("exit")
38
+ del self.model_runner
39
+ for p in self.ps:
40
+ p.join()
41
+
42
+ def add_request(self, prompt: str | list[int], sampling_params: SamplingParams, unconditional_prompt: str | list[int] | None = None):
43
+ if isinstance(prompt, str):
44
+ prompt = self.tokenizer.encode(prompt)
45
+ # For CFG: if cfg_scale > 1.0, create both conditional and unconditional sequences
46
+ if sampling_params.cfg_scale > 1.0:
47
+ if unconditional_prompt is None:
48
+ # Try to construct unconditional prompt by replacing user input with "NO USER INPUT"
49
+ # This is a fallback - ideally users should provide unconditional_prompt
50
+ if isinstance(prompt, list):
51
+ # For now, just use the same prompt (user should provide unconditional_prompt)
52
+ # TODO: Implement automatic "NO USER INPUT" replacement if possible
53
+ unconditional_prompt = prompt
54
+ else:
55
+ unconditional_prompt = prompt
56
+ if isinstance(unconditional_prompt, str):
57
+ unconditional_prompt = self.tokenizer.encode(unconditional_prompt)
58
+ # Create unconditional sequence first (so we can reference it from conditional)
59
+ uncond_seq = Sequence(unconditional_prompt, sampling_params, is_unconditional=True)
60
+ # Create conditional sequence with reference to unconditional
61
+ cond_seq = Sequence(prompt, sampling_params, is_unconditional=False, conditional_seq=uncond_seq)
62
+ uncond_seq.paired_seq = cond_seq # Link them bidirectionally
63
+ # Add both sequences to scheduler
64
+ self.scheduler.add(cond_seq)
65
+ self.scheduler.add(uncond_seq)
66
+ else:
67
+ seq = Sequence(prompt, sampling_params)
68
+ self.scheduler.add(seq)
69
+
70
+ def step(self):
71
+ seqs, is_prefill = self.scheduler.schedule()
72
+ token_ids = self.model_runner.call("run", seqs, is_prefill)
73
+ self.scheduler.postprocess(seqs, token_ids)
74
+ # Only output conditional sequences (unconditional sequences are just for CFG computation)
75
+ output_seqs = [seq for seq in seqs if seq.is_finished and (seq.cfg_scale <= 1.0 or not seq.is_unconditional)]
76
+ outputs = [(seq.seq_id, seq.completion_token_ids) for seq in output_seqs]
77
+ num_tokens = sum(len(seq) for seq in seqs) if is_prefill else -len([s for s in seqs if not s.is_unconditional])
78
+ return outputs, num_tokens
79
+
80
+ def is_finished(self):
81
+ return self.scheduler.is_finished()
82
+
83
+ def generate(
84
+ self,
85
+ prompts: list[str] | list[list[int]],
86
+ sampling_params: SamplingParams | list[SamplingParams],
87
+ use_tqdm: bool = True,
88
+ unconditional_prompts: list[str] | list[list[int]] | None = None,
89
+ ) -> list[str]:
90
+ if use_tqdm:
91
+ pbar = tqdm(total=len(prompts), desc="Generating", dynamic_ncols=True)
92
+ if not isinstance(sampling_params, list):
93
+ sampling_params = [sampling_params] * len(prompts)
94
+ if unconditional_prompts is None:
95
+ unconditional_prompts = [None] * len(prompts)
96
+ for prompt, sp, uncond_prompt in zip(prompts, sampling_params, unconditional_prompts):
97
+ self.add_request(prompt, sp, uncond_prompt)
98
+ outputs = {}
99
+ prefill_throughput = decode_throughput = 0.
100
+ while not self.is_finished():
101
+ t = perf_counter()
102
+ output, num_tokens = self.step()
103
+ if use_tqdm:
104
+ if num_tokens > 0:
105
+ prefill_throughput = num_tokens / (perf_counter() - t)
106
+ else:
107
+ decode_throughput = -num_tokens / (perf_counter() - t)
108
+ pbar.set_postfix({
109
+ "Prefill": f"{int(prefill_throughput)}tok/s",
110
+ "Decode": f"{int(decode_throughput)}tok/s",
111
+ })
112
+ for seq_id, token_ids in output:
113
+ outputs[seq_id] = token_ids
114
+ if use_tqdm:
115
+ pbar.update(1)
116
+ outputs = [outputs[seq_id] for seq_id in sorted(outputs.keys())]
117
+ outputs = [{"text": self.tokenizer.decode(token_ids), "token_ids": token_ids} for token_ids in outputs]
118
+ if use_tqdm:
119
+ pbar.close()
120
+ return outputs
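Note (not part of the diff): a hedged end-to-end usage sketch of the engine above; the checkpoint path and prompts are placeholders. With cfg_scale > 1.0, add_request schedules a conditional/unconditional pair and generate returns only the conditional outputs.

from nanovllm.llm import LLM
from nanovllm.sampling_params import SamplingParams

llm = LLM("/path/to/Qwen3-0.6B", enforce_eager=True)   # hypothetical local checkpoint dir
sp = SamplingParams(temperature=0.8, max_tokens=32, cfg_scale=2.0)
outputs = llm.generate(
    ["Write a tag list for a calm piano track."],
    sp,
    unconditional_prompts=["NO USER INPUT"],            # the CFG negative/unconditional prompt
)
print(outputs[0]["text"])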
acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py ADDED
@@ -0,0 +1,315 @@
1
+ import pickle
2
+ import torch
3
+ import torch.distributed as dist
4
+ from multiprocessing.synchronize import Event
5
+ from multiprocessing.shared_memory import SharedMemory
6
+
7
+ from nanovllm.config import Config
8
+ from nanovllm.engine.sequence import Sequence
9
+ from nanovllm.models.qwen3 import Qwen3ForCausalLM
10
+ from nanovllm.layers.sampler import Sampler
11
+ from nanovllm.utils.context import set_context, get_context, reset_context
12
+ from nanovllm.utils.loader import load_model
13
+
14
+
15
+ class ModelRunner:
16
+
17
+ def __init__(self, config: Config, rank: int, event: Event | list[Event]):
18
+ self.config = config
19
+ hf_config = config.hf_config
20
+ self.block_size = config.kvcache_block_size
21
+ self.enforce_eager = config.enforce_eager
22
+ self.world_size = config.tensor_parallel_size
23
+ self.rank = rank
24
+ self.event = event
25
+
26
+ dist.init_process_group("nccl", "tcp://localhost:2333", world_size=self.world_size, rank=rank)
27
+ torch.cuda.set_device(rank)
28
+ default_dtype = torch.get_default_dtype()
29
+ torch.set_default_dtype(hf_config.torch_dtype)
30
+ torch.set_default_device("cuda")
31
+ self.model = Qwen3ForCausalLM(hf_config)
32
+ load_model(self.model, config.model)
33
+ self.sampler = Sampler()
34
+ self.warmup_model()
35
+ self.allocate_kv_cache()
36
+ if not self.enforce_eager:
37
+ self.capture_cudagraph()
38
+ torch.set_default_device("cpu")
39
+ torch.set_default_dtype(default_dtype)
40
+
41
+ if self.world_size > 1:
42
+ if rank == 0:
43
+ self.shm = SharedMemory(name="nanovllm", create=True, size=2**20)
44
+ dist.barrier()
45
+ else:
46
+ dist.barrier()
47
+ self.shm = SharedMemory(name="nanovllm")
48
+ self.loop()
49
+
50
+ def exit(self):
51
+ if self.world_size > 1:
52
+ self.shm.close()
53
+ dist.barrier()
54
+ if self.rank == 0:
55
+ self.shm.unlink()
56
+ if not self.enforce_eager:
57
+ del self.graphs, self.graph_pool
58
+ torch.cuda.synchronize()
59
+ dist.destroy_process_group()
60
+
61
+ def loop(self):
62
+ while True:
63
+ method_name, args = self.read_shm()
64
+ self.call(method_name, *args)
65
+ if method_name == "exit":
66
+ break
67
+
68
+ def read_shm(self):
69
+ assert self.world_size > 1 and self.rank > 0
70
+ self.event.wait()
71
+ n = int.from_bytes(self.shm.buf[0:4], "little")
72
+ method_name, *args = pickle.loads(self.shm.buf[4:n+4])
73
+ self.event.clear()
74
+ return method_name, args
75
+
76
+ def write_shm(self, method_name, *args):
77
+ assert self.world_size > 1 and self.rank == 0
78
+ data = pickle.dumps([method_name, *args])
79
+ n = len(data)
80
+ self.shm.buf[0:4] = n.to_bytes(4, "little")
81
+ self.shm.buf[4:n+4] = data
82
+ for event in self.event:
83
+ event.set()
84
+
85
+ def call(self, method_name, *args):
86
+ if self.world_size > 1 and self.rank == 0:
87
+ self.write_shm(method_name, *args)
88
+ method = getattr(self, method_name, None)
89
+ return method(*args)
90
+
91
+ def warmup_model(self):
92
+ torch.cuda.empty_cache()
93
+ torch.cuda.reset_peak_memory_stats()
94
+ max_num_batched_tokens, max_model_len = self.config.max_num_batched_tokens, self.config.max_model_len
95
+ num_seqs = min(max_num_batched_tokens // max_model_len, self.config.max_num_seqs)
96
+ seqs = [Sequence([0] * max_model_len) for _ in range(num_seqs)]
97
+ self.run(seqs, True)
98
+ torch.cuda.empty_cache()
99
+
100
+ def allocate_kv_cache(self):
101
+ config = self.config
102
+ hf_config = config.hf_config
103
+ free, total = torch.cuda.mem_get_info()
104
+ used = total - free
105
+ peak = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
106
+ current = torch.cuda.memory_stats()["allocated_bytes.all.current"]
107
+ num_kv_heads = hf_config.num_key_value_heads // self.world_size
108
+ head_dim = getattr(hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads)
109
+ block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * hf_config.torch_dtype.itemsize
110
+ config.num_kvcache_blocks = int(total * config.gpu_memory_utilization - used - peak + current) // block_bytes
111
+ assert config.num_kvcache_blocks > 0
112
+ self.kv_cache = torch.empty(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, head_dim)
113
+ layer_id = 0
114
+ for module in self.model.modules():
115
+ if hasattr(module, "k_cache") and hasattr(module, "v_cache"):
116
+ module.k_cache = self.kv_cache[0, layer_id]
117
+ module.v_cache = self.kv_cache[1, layer_id]
118
+ layer_id += 1
119
+
120
+ def prepare_block_tables(self, seqs: list[Sequence]):
121
+ max_len = max(len(seq.block_table) for seq in seqs)
122
+ block_tables = [seq.block_table + [-1] * (max_len - len(seq.block_table)) for seq in seqs]
123
+ block_tables = torch.tensor(block_tables, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
124
+ return block_tables
125
+
126
+ def prepare_prefill(self, seqs: list[Sequence]):
127
+ input_ids = []
128
+ positions = []
129
+ cu_seqlens_q = [0]
130
+ cu_seqlens_k = [0]
131
+ max_seqlen_q = 0
132
+ max_seqlen_k = 0
133
+ slot_mapping = []
134
+ block_tables = None
135
+ for seq in seqs:
136
+ seqlen = len(seq)
137
+ input_ids.extend(seq[seq.num_cached_tokens:])
138
+ positions.extend(list(range(seq.num_cached_tokens, seqlen)))
139
+ seqlen_q = seqlen - seq.num_cached_tokens
140
+ seqlen_k = seqlen
141
+ cu_seqlens_q.append(cu_seqlens_q[-1] + seqlen_q)
142
+ cu_seqlens_k.append(cu_seqlens_k[-1] + seqlen_k)
143
+ max_seqlen_q = max(seqlen_q, max_seqlen_q)
144
+ max_seqlen_k = max(seqlen_k, max_seqlen_k)
145
+ if not seq.block_table: # warmup
146
+ continue
147
+ for i in range(seq.num_cached_blocks, seq.num_blocks):
148
+ start = seq.block_table[i] * self.block_size
149
+ if i != seq.num_blocks - 1:
150
+ end = start + self.block_size
151
+ else:
152
+ end = start + seq.last_block_num_tokens
153
+ slot_mapping.extend(list(range(start, end)))
154
+ if cu_seqlens_k[-1] > cu_seqlens_q[-1]: # prefix cache
155
+ block_tables = self.prepare_block_tables(seqs)
156
+ input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
157
+ positions = torch.tensor(positions, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
158
+ cu_seqlens_q = torch.tensor(cu_seqlens_q, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
159
+ cu_seqlens_k = torch.tensor(cu_seqlens_k, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
160
+ slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
161
+ set_context(True, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, None, block_tables)
162
+ return input_ids, positions
163
+
164
+ def prepare_decode(self, seqs: list[Sequence]):
165
+ input_ids = []
166
+ positions = []
167
+ slot_mapping = []
168
+ context_lens = []
169
+ for seq in seqs:
170
+ input_ids.append(seq.last_token)
171
+ positions.append(len(seq) - 1)
172
+ context_lens.append(len(seq))
173
+ slot_mapping.append(seq.block_table[-1] * self.block_size + seq.last_block_num_tokens - 1)
174
+ input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
175
+ positions = torch.tensor(positions, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
176
+ slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
177
+ context_lens = torch.tensor(context_lens, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
178
+ block_tables = self.prepare_block_tables(seqs)
179
+ set_context(False, slot_mapping=slot_mapping, context_lens=context_lens, block_tables=block_tables)
180
+ return input_ids, positions
181
+
182
+ def prepare_sample(self, seqs: list[Sequence], is_cfg_batch: bool = False):
183
+ """Prepare sampling parameters. For CFG batch, only return parameters for conditional sequences."""
184
+ if is_cfg_batch:
185
+ # For CFG batch, seqs contains [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
186
+ # We only need temperatures for conditional sequences (first half)
187
+ num_cond = len(seqs) // 2
188
+ temperatures = []
189
+ cfg_scales = []
190
+ for seq in seqs[:num_cond]:
191
+ temperatures.append(seq.temperature)
192
+ cfg_scales.append(seq.cfg_scale)
193
+ else:
194
+ temperatures = []
195
+ cfg_scales = []
196
+ for seq in seqs:
197
+ temperatures.append(seq.temperature)
198
+ cfg_scales.append(seq.cfg_scale)
199
+ temperatures = torch.tensor(temperatures, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
200
+ cfg_scales = torch.tensor(cfg_scales, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
201
+ return temperatures, cfg_scales
202
+
203
+ @torch.inference_mode()
204
+ def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill: bool):
205
+ if is_prefill or self.enforce_eager or input_ids.size(0) > 512:
206
+ return self.model.compute_logits(self.model(input_ids, positions))
207
+ else:
208
+ bs = input_ids.size(0)
209
+ context = get_context()
210
+ graph = self.graphs[next(x for x in self.graph_bs if x >= bs)]
211
+ graph_vars = self.graph_vars
212
+ graph_vars["input_ids"][:bs] = input_ids
213
+ graph_vars["positions"][:bs] = positions
214
+ graph_vars["slot_mapping"].fill_(-1)
215
+ graph_vars["slot_mapping"][:bs] = context.slot_mapping
216
+ graph_vars["context_lens"].zero_()
217
+ graph_vars["context_lens"][:bs] = context.context_lens
218
+ graph_vars["block_tables"][:bs, :context.block_tables.size(1)] = context.block_tables
219
+ graph.replay()
220
+ return self.model.compute_logits(graph_vars["outputs"][:bs])
221
+
222
+ def run(self, seqs: list[Sequence], is_prefill: bool) -> list[int]:
223
+ """Run model forward and sampling. For CFG sequences, batch is structured as:
224
+ [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
225
+ where uncond_seqi is the paired unconditional sequence of cond_seqi."""
226
+ # Check if this is a CFG batch (contains paired conditional and unconditional sequences)
227
+ is_cfg_batch = False
228
+ if len(seqs) > 0:
229
+ # CFG batch if first sequence has cfg_scale > 1.0 and paired_seq
230
+ if seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None:
231
+ is_cfg_batch = True
232
+ # Verify batch structure: first half conditional, second half unconditional
233
+ num_cond = len(seqs) // 2
234
+ for i in range(num_cond):
235
+ if seqs[i].is_unconditional or not seqs[i + num_cond].is_unconditional:
236
+ is_cfg_batch = False
237
+ break
238
+
239
+ if is_cfg_batch:
240
+ # CFG batch: seqs = [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
241
+ num_cond = len(seqs) // 2
242
+ cond_seqs = seqs[:num_cond]
243
+ uncond_seqs = seqs[num_cond:]
244
+
245
+ # Prepare inputs for both conditional and unconditional (they're already in the batch)
246
+ input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
247
+ else self.prepare_decode(seqs))
248
+ temperatures, cfg_scales = self.prepare_sample(seqs, is_cfg_batch=True) if self.rank == 0 else (None, None)
249
+
250
+ # Run model forward (processes entire batch: cond + uncond)
251
+ logits_all = self.run_model(input_ids, positions, is_prefill)
252
+ reset_context()
253
+
254
+ if self.rank == 0:
255
+ # Split logits: first half is conditional, second half is unconditional
256
+ logits_cond = logits_all[:num_cond]
257
+ logits_uncond = logits_all[num_cond:]
258
+
259
+ # Apply CFG formula: logits_cfg = logits_cond + cfg_scale * (logits_cond - logits_uncond)
260
+ cfg_scales_tensor = cfg_scales.unsqueeze(1) # [num_cond, 1]
261
+ logits_cfg = logits_cond + cfg_scales_tensor * (logits_cond - logits_uncond)
262
+
263
+ # Sample from CFG logits
264
+ token_ids_cfg = self.sampler(logits_cfg, temperatures).tolist()
265
+
266
+ # Return token_ids (will be applied to both conditional and unconditional sequences)
267
+ return token_ids_cfg
268
+ else:
269
+ return None
270
+ else:
271
+ # Normal batch (non-CFG)
272
+ input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
273
+ else self.prepare_decode(seqs))
274
+ temperatures, cfg_scales = self.prepare_sample(seqs, is_cfg_batch=False) if self.rank == 0 else (None, None)
275
+ logits = self.run_model(input_ids, positions, is_prefill)
276
+ reset_context()
277
+ token_ids = self.sampler(logits, temperatures).tolist() if self.rank == 0 else None
278
+ return token_ids
279
+
280
+ @torch.inference_mode()
281
+ def capture_cudagraph(self):
282
+ config = self.config
283
+ hf_config = config.hf_config
284
+ max_bs = min(self.config.max_num_seqs, 512)
285
+ max_num_blocks = (config.max_model_len + self.block_size - 1) // self.block_size
286
+ input_ids = torch.zeros(max_bs, dtype=torch.int64)
287
+ positions = torch.zeros(max_bs, dtype=torch.int64)
288
+ slot_mapping = torch.zeros(max_bs, dtype=torch.int32)
289
+ context_lens = torch.zeros(max_bs, dtype=torch.int32)
290
+ block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32)
291
+ outputs = torch.zeros(max_bs, hf_config.hidden_size)
292
+ self.graph_bs = [1, 2, 4, 8] + list(range(16, max_bs + 1, 16))
293
+ self.graphs = {}
294
+ self.graph_pool = None
295
+
296
+ for bs in reversed(self.graph_bs):
297
+ graph = torch.cuda.CUDAGraph()
298
+ set_context(False, slot_mapping=slot_mapping[:bs], context_lens=context_lens[:bs], block_tables=block_tables[:bs])
299
+ outputs[:bs] = self.model(input_ids[:bs], positions[:bs]) # warmup
300
+ with torch.cuda.graph(graph, self.graph_pool):
301
+ outputs[:bs] = self.model(input_ids[:bs], positions[:bs]) # capture
302
+ if self.graph_pool is None:
303
+ self.graph_pool = graph.pool()
304
+ self.graphs[bs] = graph
305
+ torch.cuda.synchronize()
306
+ reset_context()
307
+
308
+ self.graph_vars = dict(
309
+ input_ids=input_ids,
310
+ positions=positions,
311
+ slot_mapping=slot_mapping,
312
+ context_lens=context_lens,
313
+ block_tables=block_tables,
314
+ outputs=outputs,
315
+ )
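Note (not part of the diff): the guidance combination used in run(), shown on toy tensors with arbitrary values.

import torch

logits_cond = torch.tensor([[1.0, 2.0, 0.5]])
logits_uncond = torch.tensor([[0.5, 1.0, 1.5]])
cfg_scales = torch.tensor([2.0]).unsqueeze(1)            # [num_cond, 1], as in run()
logits_cfg = logits_cond + cfg_scales * (logits_cond - logits_uncond)
print(logits_cfg)                                         # -> [[2.0, 4.0, -1.5]]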
acestep/third_parts/nano-vllm/nanovllm/engine/scheduler.py ADDED
@@ -0,0 +1,222 @@
1
+ from collections import deque
2
+
3
+ from nanovllm.config import Config
4
+ from nanovllm.engine.sequence import Sequence, SequenceStatus
5
+ from nanovllm.engine.block_manager import BlockManager
6
+
7
+
8
+ class Scheduler:
9
+
10
+ def __init__(self, config: Config):
11
+ self.max_num_seqs = config.max_num_seqs
12
+ self.max_num_batched_tokens = config.max_num_batched_tokens
13
+ self.eos = config.eos
14
+ self.block_manager = BlockManager(config.num_kvcache_blocks, config.kvcache_block_size)
15
+ self.waiting: deque[Sequence] = deque()
16
+ self.running: deque[Sequence] = deque()
17
+
18
+ def is_finished(self):
19
+ return not self.waiting and not self.running
20
+
21
+ def add(self, seq: Sequence):
22
+ self.waiting.append(seq)
23
+
24
+ def schedule(self) -> tuple[list[Sequence], bool]:
25
+ # prefill
26
+ scheduled_seqs = []
27
+ num_seqs = 0
28
+ num_batched_tokens = 0
29
+ processed_seqs = set() # Track processed sequences to handle CFG pairs
30
+
31
+ while self.waiting and num_seqs < self.max_num_seqs:
32
+ seq = self.waiting[0]
33
+
34
+ # For CFG sequences, ensure conditional and unconditional are scheduled together
35
+ if seq.cfg_scale > 1.0 and seq.paired_seq is not None and not seq.is_unconditional:
36
+ # This is a conditional sequence, need to schedule its paired unconditional sequence too
37
+ paired_seq = seq.paired_seq
38
+ if paired_seq.status != SequenceStatus.WAITING:
39
+ # Paired sequence not in waiting, skip this conditional sequence for now
40
+ break
41
+
42
+ # Calculate tokens for both sequences
43
+ total_tokens = (len(seq) - seq.num_cached_tokens) + (len(paired_seq) - paired_seq.num_cached_tokens)
44
+ can_allocate_both = (self.block_manager.can_allocate(seq) and
45
+ self.block_manager.can_allocate(paired_seq))
46
+
47
+ if num_batched_tokens + total_tokens > self.max_num_batched_tokens or not can_allocate_both:
48
+ break
49
+
50
+ # Schedule both sequences: conditional first, then unconditional
51
+ for s in [seq, paired_seq]:
52
+ num_seqs += 1
53
+ self.block_manager.allocate(s)
54
+ num_batched_tokens += len(s) - s.num_cached_tokens
55
+ s.status = SequenceStatus.RUNNING
56
+ self.waiting.remove(s)
57
+ self.running.append(s)
58
+ scheduled_seqs.append(s)
59
+ processed_seqs.add(s.seq_id)
60
+ else:
61
+ # Normal sequence or unconditional sequence (already processed with its conditional)
62
+ if seq.seq_id in processed_seqs:
63
+ # Skip if already processed as part of a CFG pair
64
+ self.waiting.popleft()
65
+ continue
66
+
67
+ if num_batched_tokens + len(seq) > self.max_num_batched_tokens or not self.block_manager.can_allocate(seq):
68
+ break
69
+ num_seqs += 1
70
+ self.block_manager.allocate(seq)
71
+ num_batched_tokens += len(seq) - seq.num_cached_tokens
72
+ seq.status = SequenceStatus.RUNNING
73
+ self.waiting.popleft()
74
+ self.running.append(seq)
75
+ scheduled_seqs.append(seq)
76
+
77
+ if scheduled_seqs:
78
+ # For CFG batches, ensure conditional sequences come before their unconditional pairs
79
+ cfg_cond_seqs = [s for s in scheduled_seqs if s.cfg_scale > 1.0 and not s.is_unconditional]
80
+ cfg_uncond_seqs = [s for s in scheduled_seqs if s.is_unconditional]
81
+ non_cfg_seqs = [s for s in scheduled_seqs if s.cfg_scale <= 1.0]
82
+
83
+ # Reorder: non-CFG, then CFG conditional, then CFG unconditional
84
+ scheduled_seqs = non_cfg_seqs + cfg_cond_seqs + cfg_uncond_seqs
85
+ return scheduled_seqs, True
86
+
87
+ # decode
88
+ processed_seqs = set()
89
+ temp_running = list(self.running) # Work with a copy
90
+
91
+ while temp_running and num_seqs < self.max_num_seqs:
92
+ seq = temp_running.pop(0)
93
+
94
+ # For CFG sequences, ensure conditional and unconditional are scheduled together
95
+ if seq.cfg_scale > 1.0 and seq.paired_seq is not None and not seq.is_unconditional:
96
+ paired_seq = seq.paired_seq
97
+ if paired_seq not in temp_running:
98
+ # Paired sequence not available, skip for now
99
+ continue
100
+
101
+ # Remove paired_seq from temp_running
102
+ temp_running.remove(paired_seq)
103
+
104
+ # Check if both can append
105
+ can_append_both = (self.block_manager.can_append(seq) and
106
+ self.block_manager.can_append(paired_seq))
107
+
108
+ if not can_append_both:
109
+ # Try preempting other sequences
110
+ preempted = False
111
+ while not can_append_both and temp_running:
112
+ other_seq = temp_running.pop(0)
113
+ if other_seq != seq and other_seq != paired_seq:
114
+ self.preempt(other_seq)
115
+ can_append_both = (self.block_manager.can_append(seq) and
116
+ self.block_manager.can_append(paired_seq))
117
+ preempted = True
118
+ else:
119
+ temp_running.append(other_seq)
120
+ break
121
+
122
+ if not can_append_both:
123
+ # Can't schedule this pair right now
124
+ temp_running.append(seq)
125
+ temp_running.append(paired_seq)
126
+ continue
127
+
128
+ # Schedule both sequences
129
+ for s in [seq, paired_seq]:
130
+ num_seqs += 1
131
+ self.block_manager.may_append(s)
132
+ scheduled_seqs.append(s)
133
+ processed_seqs.add(s.seq_id)
134
+ # Remove from actual running list if scheduled
135
+ if s in self.running:
136
+ self.running.remove(s)
137
+ else:
138
+ # Normal sequence or unconditional (already processed)
139
+ if seq.seq_id in processed_seqs:
140
+ continue
141
+
142
+ while not self.block_manager.can_append(seq):
143
+ if temp_running:
144
+ other_seq = temp_running.pop(0)
145
+ if other_seq != seq:
146
+ self.preempt(other_seq)
147
+ else:
148
+ temp_running.append(other_seq)
149
+ break
150
+ else:
151
+ self.preempt(seq)
152
+ if seq in self.running:
153
+ self.running.remove(seq)
154
+ break
155
+ else:
156
+ num_seqs += 1
157
+ self.block_manager.may_append(seq)
158
+ scheduled_seqs.append(seq)
159
+ if seq in self.running:
160
+ self.running.remove(seq)
161
+
162
+ assert scheduled_seqs
163
+
164
+ # For CFG batches in decode, ensure conditional sequences come before unconditional
165
+ cfg_cond_seqs = [s for s in scheduled_seqs if s.cfg_scale > 1.0 and not s.is_unconditional]
166
+ cfg_uncond_seqs = [s for s in scheduled_seqs if s.is_unconditional]
167
+ non_cfg_seqs = [s for s in scheduled_seqs if s.cfg_scale <= 1.0]
168
+ scheduled_seqs = non_cfg_seqs + cfg_cond_seqs + cfg_uncond_seqs
169
+
170
+ self.running.extendleft(reversed(scheduled_seqs))
171
+ return scheduled_seqs, False
172
+
173
+ def preempt(self, seq: Sequence):
174
+ seq.status = SequenceStatus.WAITING
175
+ self.block_manager.deallocate(seq)
176
+ self.waiting.appendleft(seq)
177
+
178
+ def postprocess(self, seqs: list[Sequence], token_ids: list[int]) -> list[bool]:
179
+ # Check if this is a CFG batch
180
+ is_cfg_batch = False
181
+ if len(seqs) > 0 and seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None:
182
+ num_cond = len(seqs) // 2
183
+ is_cfg_batch = (num_cond > 0 and
184
+ not seqs[0].is_unconditional and
185
+ seqs[num_cond].is_unconditional)
186
+
187
+ if is_cfg_batch:
188
+ # CFG batch: seqs = [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
189
+ # token_ids correspond to conditional sequences only (sampled from CFG logits)
190
+ num_cond = len(seqs) // 2
191
+ cond_seqs = seqs[:num_cond]
192
+ uncond_seqs = seqs[num_cond:]
193
+
194
+ # Apply the same sampled token to both conditional and unconditional sequences
195
+ for cond_seq, uncond_seq, token_id in zip(cond_seqs, uncond_seqs, token_ids):
196
+ cond_seq.append_token(token_id)
197
+ uncond_seq.append_token(token_id) # Same token for unconditional
198
+
199
+ # Check if either sequence is finished
200
+ cond_finished = ((not cond_seq.ignore_eos and token_id == self.eos) or
201
+ cond_seq.num_completion_tokens == cond_seq.max_tokens)
202
+ uncond_finished = ((not uncond_seq.ignore_eos and token_id == self.eos) or
203
+ uncond_seq.num_completion_tokens == uncond_seq.max_tokens)
204
+
205
+ if cond_finished or uncond_finished:
206
+ # Mark both as finished
207
+ cond_seq.status = SequenceStatus.FINISHED
208
+ uncond_seq.status = SequenceStatus.FINISHED
209
+ self.block_manager.deallocate(cond_seq)
210
+ self.block_manager.deallocate(uncond_seq)
211
+ if cond_seq in self.running:
212
+ self.running.remove(cond_seq)
213
+ if uncond_seq in self.running:
214
+ self.running.remove(uncond_seq)
215
+ else:
216
+ # Normal batch
217
+ for seq, token_id in zip(seqs, token_ids):
218
+ seq.append_token(token_id)
219
+ if (not seq.ignore_eos and token_id == self.eos) or seq.num_completion_tokens == seq.max_tokens:
220
+ seq.status = SequenceStatus.FINISHED
221
+ self.block_manager.deallocate(seq)
222
+ self.running.remove(seq)
acestep/third_parts/nano-vllm/nanovllm/engine/sequence.py ADDED
@@ -0,0 +1,89 @@
1
+ from copy import copy
2
+ from enum import Enum, auto
3
+ from itertools import count
4
+
5
+ from nanovllm.sampling_params import SamplingParams
6
+
7
+
8
+ class SequenceStatus(Enum):
9
+ WAITING = auto()
10
+ RUNNING = auto()
11
+ FINISHED = auto()
12
+
13
+
14
+ class Sequence:
15
+ block_size = 256
16
+ counter = count()
17
+
18
+ def __init__(self, token_ids: list[int], sampling_params = SamplingParams(), is_unconditional: bool = False, conditional_seq = None):
19
+ self.seq_id = next(Sequence.counter)
20
+ self.status = SequenceStatus.WAITING
21
+ self.token_ids = copy(token_ids)
22
+ self.last_token = token_ids[-1]
23
+ self.num_tokens = len(self.token_ids)
24
+ self.num_prompt_tokens = len(token_ids)
25
+ self.num_cached_tokens = 0
26
+ self.block_table = []
27
+ self.temperature = sampling_params.temperature
28
+ self.max_tokens = sampling_params.max_tokens
29
+ self.ignore_eos = sampling_params.ignore_eos
30
+ self.cfg_scale = sampling_params.cfg_scale
31
+ # For CFG: mark if this is an unconditional sequence
32
+ self.is_unconditional = is_unconditional
33
+ # For CFG: the other member of the conditional/unconditional pair. For a conditional
35
+ # sequence this is its unconditional counterpart (and vice versa; LLMEngine.add_request links both).
36
+ self.paired_seq = conditional_seq
36
+
37
+ def __len__(self):
38
+ return self.num_tokens
39
+
40
+ def __getitem__(self, key):
41
+ return self.token_ids[key]
42
+
43
+ @property
44
+ def is_finished(self):
45
+ return self.status == SequenceStatus.FINISHED
46
+
47
+ @property
48
+ def num_completion_tokens(self):
49
+ return self.num_tokens - self.num_prompt_tokens
50
+
51
+ @property
52
+ def prompt_token_ids(self):
53
+ return self.token_ids[:self.num_prompt_tokens]
54
+
55
+ @property
56
+ def completion_token_ids(self):
57
+ return self.token_ids[self.num_prompt_tokens:]
58
+
59
+ @property
60
+ def num_cached_blocks(self):
61
+ return self.num_cached_tokens // self.block_size
62
+
63
+ @property
64
+ def num_blocks(self):
65
+ return (self.num_tokens + self.block_size - 1) // self.block_size
66
+
67
+ @property
68
+ def last_block_num_tokens(self):
69
+ return self.num_tokens - (self.num_blocks - 1) * self.block_size
70
+
71
+ def block(self, i):
72
+ assert 0 <= i < self.num_blocks
73
+ return self.token_ids[i*self.block_size: (i+1)*self.block_size]
74
+
75
+ def append_token(self, token_id: int):
76
+ self.token_ids.append(token_id)
77
+ self.last_token = token_id
78
+ self.num_tokens += 1
79
+
80
+ def __getstate__(self):
81
+ return (self.num_tokens, self.num_prompt_tokens, self.num_cached_tokens, self.block_table,
82
+ self.token_ids if self.num_completion_tokens == 0 else self.last_token)
83
+
84
+ def __setstate__(self, state):
85
+ self.num_tokens, self.num_prompt_tokens, self.num_cached_tokens, self.block_table = state[:-1]
86
+ if self.num_completion_tokens == 0:
87
+ self.token_ids = state[-1]
88
+ else:
89
+ self.last_token = state[-1]
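Note (not part of the diff): a quick sanity sketch of the block bookkeeping in Sequence, using the default block_size of 256.

from nanovllm.engine.sequence import Sequence

seq = Sequence(list(range(300)))          # default SamplingParams
assert seq.num_blocks == 2                # ceil(300 / 256)
assert seq.last_block_num_tokens == 44    # 300 - 256
assert seq.block(0) == list(range(256))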
acestep/third_parts/nano-vllm/nanovllm/layers/activation.py ADDED
@@ -0,0 +1,14 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+
6
+ class SiluAndMul(nn.Module):
7
+
8
+ def __init__(self):
9
+ super().__init__()
10
+
11
+ @torch.compile
12
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
13
+ x, y = x.chunk(2, -1)
14
+ return F.silu(x) * y
acestep/third_parts/nano-vllm/nanovllm/layers/attention.py ADDED
@@ -0,0 +1,75 @@
1
+ import torch
2
+ from torch import nn
3
+ import triton
4
+ import triton.language as tl
5
+
6
+ from flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
7
+ from nanovllm.utils.context import get_context
8
+
9
+
10
+ @triton.jit
11
+ def store_kvcache_kernel(
12
+ key_ptr,
13
+ key_stride,
14
+ value_ptr,
15
+ value_stride,
16
+ k_cache_ptr,
17
+ v_cache_ptr,
18
+ slot_mapping_ptr,
19
+ D: tl.constexpr,
20
+ ):
21
+ idx = tl.program_id(0)
22
+ slot = tl.load(slot_mapping_ptr + idx)
23
+ if slot == -1: return
24
+ key_offsets = idx * key_stride + tl.arange(0, D)
25
+ value_offsets = idx * value_stride + tl.arange(0, D)
26
+ key = tl.load(key_ptr + key_offsets)
27
+ value = tl.load(value_ptr + value_offsets)
28
+ cache_offsets = slot * D + tl.arange(0, D)
29
+ tl.store(k_cache_ptr + cache_offsets, key)
30
+ tl.store(v_cache_ptr + cache_offsets, value)
31
+
32
+
33
+ def store_kvcache(key: torch.Tensor, value: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor, slot_mapping: torch.Tensor):
34
+ N, num_heads, head_dim = key.shape
35
+ D = num_heads * head_dim
36
+ assert key.stride(-1) == 1 and value.stride(-1) == 1
37
+ assert key.stride(1) == head_dim and value.stride(1) == head_dim
38
+ assert k_cache.stride(1) == D and v_cache.stride(1) == D
39
+ assert slot_mapping.numel() == N
40
+ store_kvcache_kernel[(N,)](key, key.stride(0), value, value.stride(0), k_cache, v_cache, slot_mapping, D)
41
+
42
+
43
+ class Attention(nn.Module):
44
+
45
+ def __init__(
46
+ self,
47
+ num_heads,
48
+ head_dim,
49
+ scale,
50
+ num_kv_heads,
51
+ ):
52
+ super().__init__()
53
+ self.num_heads = num_heads
54
+ self.head_dim = head_dim
55
+ self.scale = scale
56
+ self.num_kv_heads = num_kv_heads
57
+ self.k_cache = self.v_cache = torch.tensor([])
58
+
59
+ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
60
+ context = get_context()
61
+ k_cache, v_cache = self.k_cache, self.v_cache
62
+ if k_cache.numel() and v_cache.numel():
63
+ store_kvcache(k, v, k_cache, v_cache, context.slot_mapping)
64
+ if context.is_prefill:
65
+ if context.block_tables is not None: # prefix cache
66
+ k, v = k_cache, v_cache
67
+ o = flash_attn_varlen_func(q, k, v,
68
+ max_seqlen_q=context.max_seqlen_q, cu_seqlens_q=context.cu_seqlens_q,
69
+ max_seqlen_k=context.max_seqlen_k, cu_seqlens_k=context.cu_seqlens_k,
70
+ softmax_scale=self.scale, causal=True, block_table=context.block_tables)
71
+ else: # decode
72
+ o = flash_attn_with_kvcache(q.unsqueeze(1), k_cache, v_cache,
73
+ cache_seqlens=context.context_lens, block_table=context.block_tables,
74
+ softmax_scale=self.scale, causal=True)
75
+ return o
acestep/third_parts/nano-vllm/nanovllm/layers/embed_head.py ADDED
@@ -0,0 +1,66 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ import torch.distributed as dist
5
+
6
+ from nanovllm.utils.context import get_context
7
+
8
+
9
+ class VocabParallelEmbedding(nn.Module):
10
+
11
+ def __init__(
12
+ self,
13
+ num_embeddings: int,
14
+ embedding_dim: int,
15
+ ):
16
+ super().__init__()
17
+ self.tp_rank = dist.get_rank()
18
+ self.tp_size = dist.get_world_size()
19
+ assert num_embeddings % self.tp_size == 0
20
+ self.num_embeddings = num_embeddings
21
+ self.num_embeddings_per_partition = self.num_embeddings // self.tp_size
22
+ self.vocab_start_idx = self.num_embeddings_per_partition * self.tp_rank
23
+ self.vocab_end_idx = self.vocab_start_idx + self.num_embeddings_per_partition
24
+ self.weight = nn.Parameter(torch.empty(self.num_embeddings_per_partition, embedding_dim))
25
+ self.weight.weight_loader = self.weight_loader
26
+
27
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
28
+ param_data = param.data
29
+ shard_size = param_data.size(0)
30
+ start_idx = self.tp_rank * shard_size
31
+ loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
32
+ param_data.copy_(loaded_weight)
33
+
34
+ def forward(self, x: torch.Tensor):
35
+ if self.tp_size > 1:
36
+ mask = (x >= self.vocab_start_idx) & (x < self.vocab_end_idx)
37
+ x = mask * (x - self.vocab_start_idx)
38
+ y = F.embedding(x, self.weight)
39
+ if self.tp_size > 1:
40
+ y = mask.unsqueeze(1) * y
41
+ dist.all_reduce(y)
42
+ return y
43
+
44
+
45
+ class ParallelLMHead(VocabParallelEmbedding):
46
+
47
+ def __init__(
48
+ self,
49
+ num_embeddings: int,
50
+ embedding_dim: int,
51
+ bias: bool = False,
52
+ ):
53
+ assert not bias
54
+ super().__init__(num_embeddings, embedding_dim)
55
+
56
+ def forward(self, x: torch.Tensor):
57
+ context = get_context()
58
+ if context.is_prefill:
59
+ last_indices = context.cu_seqlens_q[1:] - 1
60
+ x = x[last_indices].contiguous()
61
+ logits = F.linear(x, self.weight)
62
+ if self.tp_size > 1:
63
+ all_logits = [torch.empty_like(logits) for _ in range(self.tp_size)] if self.tp_rank == 0 else None
64
+ dist.gather(logits, all_logits, 0)
65
+ logits = torch.cat(all_logits, -1) if self.tp_rank == 0 else None
66
+ return logits
acestep/third_parts/nano-vllm/nanovllm/layers/layernorm.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ class RMSNorm(nn.Module):
6
+
7
+ def __init__(
8
+ self,
9
+ hidden_size: int,
10
+ eps: float = 1e-6,
11
+ ) -> None:
12
+ super().__init__()
13
+ self.eps = eps
14
+ self.weight = nn.Parameter(torch.ones(hidden_size))
15
+
16
+ @torch.compile
17
+ def rms_forward(
18
+ self,
19
+ x: torch.Tensor,
20
+ ) -> torch.Tensor:
21
+ orig_dtype = x.dtype
22
+ x = x.float()
23
+ var = x.pow(2).mean(dim=-1, keepdim=True)
24
+ x.mul_(torch.rsqrt(var + self.eps))
25
+ x = x.to(orig_dtype).mul_(self.weight)
26
+ return x
27
+
28
+ @torch.compile
29
+ def add_rms_forward(
30
+ self,
31
+ x: torch.Tensor,
32
+ residual: torch.Tensor,
33
+ ) -> tuple[torch.Tensor, torch.Tensor]:
34
+ orig_dtype = x.dtype
35
+ x = x.float().add_(residual.float())
36
+ residual = x.to(orig_dtype)
37
+ var = x.pow(2).mean(dim=-1, keepdim=True)
38
+ x.mul_(torch.rsqrt(var + self.eps))
39
+ x = x.to(orig_dtype).mul_(self.weight)
40
+ return x, residual
41
+
42
+ def forward(
43
+ self,
44
+ x: torch.Tensor,
45
+ residual: torch.Tensor | None = None,
46
+ ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
47
+ if residual is None:
48
+ return self.rms_forward(x)
49
+ else:
50
+ return self.add_rms_forward(x, residual)
acestep/third_parts/nano-vllm/nanovllm/layers/linear.py ADDED
@@ -0,0 +1,153 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ import torch.distributed as dist
5
+
6
+
7
+ def divide(numerator, denominator):
8
+ assert numerator % denominator == 0
9
+ return numerator // denominator
10
+
11
+
12
+ class LinearBase(nn.Module):
13
+
14
+ def __init__(
15
+ self,
16
+ input_size: int,
17
+ output_size: int,
18
+ bias: bool = False,
19
+ tp_dim: int | None = None,
20
+ ):
21
+ super().__init__()
22
+ self.tp_dim = tp_dim
23
+ self.tp_rank = dist.get_rank()
24
+ self.tp_size = dist.get_world_size()
25
+ self.weight = nn.Parameter(torch.empty(output_size, input_size))
26
+ self.weight.weight_loader = self.weight_loader
27
+ if bias:
28
+ self.bias = nn.Parameter(torch.empty(output_size))
29
+ self.bias.weight_loader = self.weight_loader
30
+ else:
31
+ self.register_parameter("bias", None)
32
+
33
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
34
+ raise NotImplementedError
35
+
36
+
37
+ class ReplicatedLinear(LinearBase):
38
+
39
+ def __init__(
40
+ self,
41
+ input_size: int,
42
+ output_size: int,
43
+ bias: bool = False,
44
+ ):
45
+ super().__init__(input_size, output_size, bias)
46
+
47
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
48
+ param.data.copy_(loaded_weight)
49
+
50
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
51
+ return F.linear(x, self.weight, self.bias)
52
+
53
+
54
+ class ColumnParallelLinear(LinearBase):
55
+
56
+ def __init__(
57
+ self,
58
+ input_size: int,
59
+ output_size: int,
60
+ bias: bool = False,
61
+ ):
62
+ tp_size = dist.get_world_size()
63
+ super().__init__(input_size, divide(output_size, tp_size), bias, 0)
64
+
65
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
66
+ param_data = param.data
67
+ shard_size = param_data.size(self.tp_dim)
68
+ start_idx = self.tp_rank * shard_size
69
+ loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size)
70
+ param_data.copy_(loaded_weight)
71
+
72
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
73
+ return F.linear(x, self.weight, self.bias)
74
+
75
+
76
+ class MergedColumnParallelLinear(ColumnParallelLinear):
77
+
78
+ def __init__(
79
+ self,
80
+ input_size: int,
81
+ output_sizes: list[int],
82
+ bias: bool = False,
83
+ ):
84
+ self.output_sizes = output_sizes
85
+ super().__init__(input_size, sum(output_sizes), bias)
86
+
87
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: int):
88
+ param_data = param.data
89
+ shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
90
+ shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
91
+ param_data = param_data.narrow(self.tp_dim, shard_offset, shard_size)
92
+ loaded_weight = loaded_weight.chunk(self.tp_size, self.tp_dim)[self.tp_rank]
93
+ param_data.copy_(loaded_weight)
94
+
95
+
96
+ class QKVParallelLinear(ColumnParallelLinear):
97
+
98
+ def __init__(
99
+ self,
100
+ hidden_size: int,
101
+ head_size: int,
102
+ total_num_heads: int,
103
+ total_num_kv_heads: int | None = None,
104
+ bias: bool = False,
105
+ ):
106
+ tp_size = dist.get_world_size()
107
+ total_num_kv_heads = total_num_kv_heads or total_num_heads
108
+ self.head_size = head_size
109
+ self.num_heads = divide(total_num_heads, tp_size)
110
+ self.num_kv_heads = divide(total_num_kv_heads, tp_size)
111
+ output_size = (total_num_heads + 2 * total_num_kv_heads) * self.head_size
112
+ super().__init__(hidden_size, output_size, bias)
113
+
114
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: str):
115
+ param_data = param.data
116
+ assert loaded_shard_id in ["q", "k", "v"]
117
+ if loaded_shard_id == "q":
118
+ shard_size = self.num_heads * self.head_size
119
+ shard_offset = 0
120
+ elif loaded_shard_id == "k":
121
+ shard_size = self.num_kv_heads * self.head_size
122
+ shard_offset = self.num_heads * self.head_size
123
+ else:
124
+ shard_size = self.num_kv_heads * self.head_size
125
+ shard_offset = self.num_heads * self.head_size + self.num_kv_heads * self.head_size
126
+ param_data = param_data.narrow(self.tp_dim, shard_offset, shard_size)
127
+ loaded_weight = loaded_weight.chunk(self.tp_size, self.tp_dim)[self.tp_rank]
128
+ param_data.copy_(loaded_weight)
129
+
130
+
131
+ class RowParallelLinear(LinearBase):
132
+
133
+ def __init__(
134
+ self,
135
+ input_size: int,
136
+ output_size: int,
137
+ bias: bool = False,
138
+ ):
139
+ tp_size = dist.get_world_size()
140
+ super().__init__(divide(input_size, tp_size), output_size, bias, 1)
141
+
142
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
143
+ param_data = param.data
144
+ shard_size = param_data.size(self.tp_dim)
145
+ start_idx = self.tp_rank * shard_size
146
+ loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size)
147
+ param_data.copy_(loaded_weight)
148
+
149
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
150
+ y = F.linear(x, self.weight, self.bias if self.tp_rank == 0 else None)
151
+ if self.tp_size > 1:
152
+ dist.all_reduce(y)
153
+ return y
acestep/third_parts/nano-vllm/nanovllm/layers/rotary_embedding.py ADDED
@@ -0,0 +1,61 @@
1
+ from functools import lru_cache
2
+ import torch
3
+ from torch import nn
4
+
5
+
6
+ def apply_rotary_emb(
7
+ x: torch.Tensor,
8
+ cos: torch.Tensor,
9
+ sin: torch.Tensor,
10
+ ) -> torch.Tensor:
11
+ x1, x2 = torch.chunk(x.float(), 2, dim=-1)
12
+ y1 = x1 * cos - x2 * sin
13
+ y2 = x2 * cos + x1 * sin
14
+ return torch.cat((y1, y2), dim=-1).to(x.dtype)
15
+
16
+
17
+ class RotaryEmbedding(nn.Module):
18
+
19
+ def __init__(
20
+ self,
21
+ head_size: int,
22
+ rotary_dim: int,
23
+ max_position_embeddings: int,
24
+ base: float,
25
+ ) -> None:
26
+ super().__init__()
27
+ self.head_size = head_size
28
+ assert rotary_dim == head_size
29
+ inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim))
30
+ t = torch.arange(max_position_embeddings, dtype=torch.float)
31
+ freqs = torch.einsum("i,j -> ij", t, inv_freq)
32
+ cos = freqs.cos()
33
+ sin = freqs.sin()
34
+ cache = torch.cat((cos, sin), dim=-1).unsqueeze_(1)
35
+ self.register_buffer("cos_sin_cache", cache, persistent=False)
36
+
37
+ @torch.compile
38
+ def forward(
39
+ self,
40
+ positions: torch.Tensor,
41
+ query: torch.Tensor,
42
+ key: torch.Tensor,
43
+ ) -> tuple[torch.Tensor, torch.Tensor]:
44
+ cos_sin = self.cos_sin_cache[positions]
45
+ cos, sin = cos_sin.chunk(2, dim=-1)
46
+ query = apply_rotary_emb(query, cos, sin)
47
+ key = apply_rotary_emb(key, cos, sin)
48
+ return query, key
49
+
50
+
51
+ @lru_cache(1)
52
+ def get_rope(
53
+ head_size: int,
54
+ rotary_dim: int,
55
+ max_position: int,
56
+ base: float,
57
+ rope_scaling: dict | None = None,
58
+ ):
59
+ assert rope_scaling is None
60
+ rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base)
61
+ return rotary_emb
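Note (not part of the diff): apply_rotary_emb above rotates each (x1, x2) pair, so it preserves the norm over the head dimension; the shapes here are illustrative.

import torch
from nanovllm.layers.rotary_embedding import apply_rotary_emb

x = torch.randn(4, 2, 64)                 # (tokens, heads, head_dim)
angles = torch.rand(4, 1, 32)             # broadcast over heads, half of head_dim
y = apply_rotary_emb(x, angles.cos(), angles.sin())
assert torch.allclose(y.norm(dim=-1), x.norm(dim=-1), atol=1e-5)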
acestep/third_parts/nano-vllm/nanovllm/layers/sampler.py ADDED
@@ -0,0 +1,15 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ class Sampler(nn.Module):
6
+
7
+ def __init__(self):
8
+ super().__init__()
9
+
10
+ @torch.compile
11
+ def forward(self, logits: torch.Tensor, temperatures: torch.Tensor):
12
+ logits = logits.float().div_(temperatures.unsqueeze(dim=1))
13
+ probs = torch.softmax(logits, dim=-1)
14
+ sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)).argmax(dim=-1)
15
+ return sample_tokens
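Note (not part of the diff): dividing probabilities by i.i.d. Exp(1) noise and taking the argmax, as Sampler.forward does, samples index i with probability probs[i] (the Gumbel-max trick in exponential form). A quick empirical check:

import torch

torch.manual_seed(0)
probs = torch.tensor([0.1, 0.6, 0.3])
counts = torch.zeros(3)
for _ in range(20000):
    noise = torch.empty_like(probs).exponential_(1)
    counts[(probs / noise).argmax()] += 1
print(counts / counts.sum())              # close to tensor([0.1, 0.6, 0.3])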
acestep/third_parts/nano-vllm/nanovllm/llm.py ADDED
@@ -0,0 +1,5 @@
1
+ from nanovllm.engine.llm_engine import LLMEngine
2
+
3
+
4
+ class LLM(LLMEngine):
5
+ pass
acestep/third_parts/nano-vllm/nanovllm/models/qwen3.py ADDED
@@ -0,0 +1,215 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.distributed as dist
4
+ from transformers import Qwen3Config
5
+
6
+ from nanovllm.layers.activation import SiluAndMul
7
+ from nanovllm.layers.attention import Attention
8
+ from nanovllm.layers.layernorm import RMSNorm
9
+ from nanovllm.layers.linear import QKVParallelLinear, MergedColumnParallelLinear, RowParallelLinear
10
+ from nanovllm.layers.rotary_embedding import get_rope
11
+ from nanovllm.layers.embed_head import VocabParallelEmbedding, ParallelLMHead
12
+
13
+
14
+ class Qwen3Attention(nn.Module):
15
+
16
+ def __init__(
17
+ self,
18
+ hidden_size: int,
19
+ num_heads: int,
20
+ num_kv_heads: int,
21
+ max_position: int = 4096 * 32,
22
+ head_dim: int | None = None,
23
+ rms_norm_eps: float = 1e-06,
24
+ qkv_bias: bool = False,
25
+ rope_theta: float = 10000,
26
+ rope_scaling: tuple | None = None,
27
+ ) -> None:
28
+ super().__init__()
29
+ tp_size = dist.get_world_size()
30
+ self.total_num_heads = num_heads
31
+ assert self.total_num_heads % tp_size == 0
32
+ self.num_heads = self.total_num_heads // tp_size
33
+ self.total_num_kv_heads = num_kv_heads
34
+ assert self.total_num_kv_heads % tp_size == 0
35
+ self.num_kv_heads = self.total_num_kv_heads // tp_size
36
+ self.head_dim = head_dim or hidden_size // self.total_num_heads
37
+ self.q_size = self.num_heads * self.head_dim
38
+ self.kv_size = self.num_kv_heads * self.head_dim
39
+ self.scaling = self.head_dim ** -0.5
40
+ self.qkv_bias = qkv_bias
41
+
42
+ self.qkv_proj = QKVParallelLinear(
43
+ hidden_size,
44
+ self.head_dim,
45
+ self.total_num_heads,
46
+ self.total_num_kv_heads,
47
+ bias=qkv_bias,
48
+ )
49
+ self.o_proj = RowParallelLinear(
50
+ self.total_num_heads * self.head_dim,
51
+ hidden_size,
52
+ bias=False,
53
+ )
54
+ self.rotary_emb = get_rope(
55
+ self.head_dim,
56
+ rotary_dim=self.head_dim,
57
+ max_position=max_position,
58
+ base=rope_theta,
59
+ rope_scaling=rope_scaling,
60
+ )
61
+ self.attn = Attention(
62
+ self.num_heads,
63
+ self.head_dim,
64
+ self.scaling,
65
+ self.num_kv_heads,
66
+ )
67
+ if not self.qkv_bias:
68
+ self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
69
+ self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
70
+
71
+ def forward(
72
+ self,
73
+ positions: torch.Tensor,
74
+ hidden_states: torch.Tensor,
75
+ ) -> torch.Tensor:
76
+ qkv = self.qkv_proj(hidden_states)
77
+ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
78
+ q = q.view(-1, self.num_heads, self.head_dim)
79
+ k = k.view(-1, self.num_kv_heads, self.head_dim)
80
+ v = v.view(-1, self.num_kv_heads, self.head_dim)
81
+ if not self.qkv_bias:
82
+ q = self.q_norm(q)
83
+ k = self.k_norm(k)
84
+ q, k = self.rotary_emb(positions, q, k)
85
+ o = self.attn(q, k, v)
86
+ output = self.o_proj(o.flatten(1, -1))
87
+ return output
88
+
89
+
90
+ class Qwen3MLP(nn.Module):
91
+
92
+ def __init__(
93
+ self,
94
+ hidden_size: int,
95
+ intermediate_size: int,
96
+ hidden_act: str,
97
+ ) -> None:
98
+ super().__init__()
99
+ self.gate_up_proj = MergedColumnParallelLinear(
100
+ hidden_size,
101
+ [intermediate_size] * 2,
102
+ bias=False,
103
+ )
104
+ self.down_proj = RowParallelLinear(
105
+ intermediate_size,
106
+ hidden_size,
107
+ bias=False,
108
+ )
109
+ assert hidden_act == "silu"
110
+ self.act_fn = SiluAndMul()
111
+
112
+ def forward(self, x):
113
+ gate_up = self.gate_up_proj(x)
114
+ x = self.act_fn(gate_up)
115
+ x = self.down_proj(x)
116
+ return x
117
+
118
+
119
+ class Qwen3DecoderLayer(nn.Module):
120
+
121
+ def __init__(
122
+ self,
123
+ config: Qwen3Config,
124
+ ) -> None:
125
+ super().__init__()
126
+ self.self_attn = Qwen3Attention(
127
+ hidden_size=config.hidden_size,
128
+ num_heads=config.num_attention_heads,
129
+ num_kv_heads=config.num_key_value_heads,
130
+ max_position=config.max_position_embeddings,
131
+ rms_norm_eps=config.rms_norm_eps,
132
+ qkv_bias=getattr(config, 'attention_bias', True),
133
+ head_dim=getattr(config, 'head_dim', None),
134
+ rope_theta=getattr(config, "rope_theta", 1000000),
135
+ rope_scaling=getattr(config, "rope_scaling", None),
136
+ )
137
+ self.mlp = Qwen3MLP(
138
+ hidden_size=config.hidden_size,
139
+ intermediate_size=config.intermediate_size,
140
+ hidden_act=config.hidden_act,
141
+ )
142
+ self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
143
+ self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
144
+
145
+ def forward(
146
+ self,
147
+ positions: torch.Tensor,
148
+ hidden_states: torch.Tensor,
149
+ residual: torch.Tensor | None,
150
+ ) -> tuple[torch.Tensor, torch.Tensor]:
151
+ if residual is None:
152
+ hidden_states, residual = self.input_layernorm(hidden_states), hidden_states
153
+ else:
154
+ hidden_states, residual = self.input_layernorm(hidden_states, residual)
155
+ hidden_states = self.self_attn(positions, hidden_states)
156
+ hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
157
+ hidden_states = self.mlp(hidden_states)
158
+ return hidden_states, residual
159
+
160
+
161
+ class Qwen3Model(nn.Module):
162
+
163
+ def __init__(
164
+ self,
165
+ config: Qwen3Config,
166
+ ) -> None:
167
+ super().__init__()
168
+ self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size)
169
+ self.layers = nn.ModuleList([Qwen3DecoderLayer(config) for _ in range(config.num_hidden_layers)])
170
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
171
+
172
+ def forward(
173
+ self,
174
+ input_ids: torch.Tensor,
175
+ positions: torch.Tensor,
176
+ ) -> torch.Tensor:
177
+ hidden_states = self.embed_tokens(input_ids)
178
+ residual = None
179
+ for layer in self.layers:
180
+ hidden_states, residual = layer(positions, hidden_states, residual)
181
+ hidden_states, _ = self.norm(hidden_states, residual)
182
+ return hidden_states
183
+
184
+
185
+ class Qwen3ForCausalLM(nn.Module):
186
+ packed_modules_mapping = {
187
+ "q_proj": ("qkv_proj", "q"),
188
+ "k_proj": ("qkv_proj", "k"),
189
+ "v_proj": ("qkv_proj", "v"),
190
+ "gate_proj": ("gate_up_proj", 0),
191
+ "up_proj": ("gate_up_proj", 1),
192
+ }
193
+
194
+ def __init__(
195
+ self,
196
+ config: Qwen3Config
197
+ ) -> None:
198
+ super().__init__()
199
+ self.model = Qwen3Model(config)
200
+ self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
201
+ if config.tie_word_embeddings:
202
+ self.lm_head.weight.data = self.model.embed_tokens.weight.data
203
+
204
+ def forward(
205
+ self,
206
+ input_ids: torch.Tensor,
207
+ positions: torch.Tensor,
208
+ ) -> torch.Tensor:
209
+ return self.model(input_ids, positions)
210
+
211
+ def compute_logits(
212
+ self,
213
+ hidden_states: torch.Tensor,
214
+ ) -> torch.Tensor:
215
+ return self.lm_head(hidden_states)
acestep/third_parts/nano-vllm/nanovllm/sampling_params.py ADDED
@@ -0,0 +1,13 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
+ @dataclass
5
+ class SamplingParams:
6
+ temperature: float = 1.0
7
+ max_tokens: int = 64
8
+ ignore_eos: bool = False
9
+ cfg_scale: float = 1.0 # CFG guidance scale. When > 1.0, applies classifier-free guidance
10
+
11
+ def __post_init__(self):
12
+ assert self.temperature > 1e-10, "greedy sampling is not permitted"
13
+ assert self.cfg_scale >= 1.0, "cfg_scale must be >= 1.0"
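Note (not part of the diff): both asserts above run in __post_init__, so invalid combinations fail at construction time.

from nanovllm.sampling_params import SamplingParams

sp = SamplingParams(temperature=0.7, max_tokens=128, cfg_scale=1.5)   # valid
try:
    SamplingParams(cfg_scale=0.5)          # rejected: cfg_scale must be >= 1.0
except AssertionError as e:
    print(e)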
acestep/third_parts/nano-vllm/nanovllm/utils/context.py ADDED
@@ -0,0 +1,27 @@
1
+ from dataclasses import dataclass
2
+ import torch
3
+
4
+
5
+ @dataclass
6
+ class Context:
7
+ is_prefill: bool = False
8
+ cu_seqlens_q: torch.Tensor | None = None
9
+ cu_seqlens_k: torch.Tensor | None = None
10
+ max_seqlen_q: int = 0
11
+ max_seqlen_k: int = 0
12
+ slot_mapping: torch.Tensor | None = None
13
+ context_lens: torch.Tensor | None = None
14
+ block_tables: torch.Tensor | None = None
15
+
16
+ _CONTEXT = Context()
17
+
18
+ def get_context():
19
+ return _CONTEXT
20
+
21
+ def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0, max_seqlen_k=0, slot_mapping=None, context_lens=None, block_tables=None):
22
+ global _CONTEXT
23
+ _CONTEXT = Context(is_prefill, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, context_lens, block_tables)
24
+
25
+ def reset_context():
26
+ global _CONTEXT
27
+ _CONTEXT = Context()
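Note (not part of the diff): the module-level context is a simple set/get/reset round trip, which is how the model runner hands attention metadata to the layers.

from nanovllm.utils.context import set_context, get_context, reset_context

set_context(True, max_seqlen_q=8)
assert get_context().is_prefill and get_context().max_seqlen_q == 8
reset_context()
assert not get_context().is_prefill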
acestep/third_parts/nano-vllm/nanovllm/utils/loader.py ADDED
@@ -0,0 +1,28 @@
1
+ import os
2
+ from glob import glob
3
+ import torch
4
+ from torch import nn
5
+ from safetensors import safe_open
6
+
7
+
8
+ def default_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor):
9
+ param.data.copy_(loaded_weight)
10
+
11
+
12
+ def load_model(model: nn.Module, path: str):
13
+ packed_modules_mapping = getattr(model, "packed_modules_mapping", {})
14
+ for file in glob(os.path.join(path, "*.safetensors")):
15
+ with safe_open(file, "pt", "cpu") as f:
16
+ for weight_name in f.keys():
17
+ for k in packed_modules_mapping:
18
+ if k in weight_name:
19
+ v, shard_id = packed_modules_mapping[k]
20
+ param_name = weight_name.replace(k, v)
21
+ param = model.get_parameter(param_name)
22
+ weight_loader = getattr(param, "weight_loader")
23
+ weight_loader(param, f.get_tensor(weight_name), shard_id)
24
+ break
25
+ else:
26
+ param = model.get_parameter(weight_name)
27
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
28
+ weight_loader(param, f.get_tensor(weight_name))
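Note (not part of the diff): an illustration of the name rewriting load_model performs via packed_modules_mapping; the names below are examples only. A checkpoint tensor for q_proj is routed into the fused qkv_proj parameter under shard id "q".

mapping = {"q_proj": ("qkv_proj", "q"), "gate_proj": ("gate_up_proj", 0)}
name = "model.layers.0.self_attn.q_proj.weight"
for k, (v, shard_id) in mapping.items():
    if k in name:
        print(name.replace(k, v), shard_id)   # model.layers.0.self_attn.qkv_proj.weight q
        break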
acestep/third_parts/nano-vllm/pyproject.toml ADDED
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "nano-vllm"
7
+ version = "0.2.0"
8
+ authors = [{ name = "Xingkai Yu" }]
9
+ license = "MIT"
10
+ license-files = ["LICENSE"]
11
+ readme = "README.md"
12
+ description = "a lightweight vLLM implementation built from scratch"
13
+ requires-python = ">=3.10,<3.13"
14
+ dependencies = [
15
+ "torch>=2.4.0",
16
+ "triton>=3.0.0",
17
+ "transformers>=4.51.0",
18
+ "flash-attn",
19
+ "xxhash",
20
+ ]
21
+
22
+ [project.urls]
23
+ Homepage="https://github.com/GeeeekExplorer/nano-vllm"
24
+
25
+ [tool.setuptools.packages.find]
26
+ where = ["."]
27
+ include = ["nanovllm*"]
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ torch
2
+ transformers
3
+ diffusers
4
+ gradio