Blackwell

#25 opened by Kwissbeats

I see Blackwell (50xx) is not mentioned to be supported, is there anything preventing it to work on those cards?

Actually this model can run on a 50-series card; last night I successfully ran it on my cloud server.

Can you share how you set it up, mate? And how's it holding up? Are conversations good?

First, go to the repo and accept the terms so you get download access.
If you are using Miniconda:

conda create -n nvidia python=3.11
conda activate nvidia
pip install gradio spaces torch numpy huggingface_hub sentencepiece sphn safetensors git+https://github.com/NVIDIA/personaplex.git#subdirectory=moshi

Install the Hugging Face CLI:
Windows: powershell -ExecutionPolicy ByPass -c "irm https://hf.co/cli/install.ps1 | iex"
Linux: curl -LsSf https://hf.co/cli/install.sh | bash
Restart the terminal.
Run: hf login

Go to https://huggingface.co/settings/tokens and 'Create new token' -> 'Read' -> 'Token name' -> 'Create token'.
Copy the token, then right-click to paste it into the login prompt.
Save the code below as app.py and run it:
python app.py
(This runs with CPU offload: the audio codec stays on the GPU while the language model runs on the CPU, so it fits in low VRAM.)

import gradio as gr
import spaces
import torch
import numpy as np
import os
import tarfile
from pathlib import Path
from typing import Optional
from huggingface_hub import hf_hub_download
import sentencepiece

# ==========================================
# CONFIGURATION
# ==========================================

HF_REPO = "nvidia/personaplex-7b-v1"

# Low VRAM Strategy:
# AUDIO_DEVICE: Use GPU for fast audio encoding/decoding (Mimi)
# LM_DEVICE: Use CPU for the heavy Language Model (Moshi)
AUDIO_DEVICE = "cuda"
LM_DEVICE = "cpu"

SAMPLE_RATE = 24000

# ==========================================
# DATA DEFINITIONS
# ==========================================

# Available voices in PersonaPlex
ALL_VOICES = [
    "NATF0", "NATF1", "NATF2", "NATF3",  # Natural Female
    "NATM0", "NATM1", "NATM2", "NATM3",  # Natural Male
    "VARF0", "VARF1", "VARF2", "VARF3", "VARF4",  # Variety Female
    "VARM0", "VARM1", "VARM2", "VARM3", "VARM4",  # Variety Male
]

# Example persona prompts from PersonaPlex paper
EXAMPLE_PERSONAS = [
    "You are a wise and friendly teacher. Answer questions or provide advice in a clear and engaging way.",
    "You enjoy having a good conversation.",
    "You work for CitySan Services which is a waste management company and your name is Ayelen Lucero.",
    "You enjoy having a good conversation. Have a technical discussion about fixing a reactor core on a spaceship to Mars. You are an astronaut on a Mars mission. Your name is Alex.",
]

# Import moshi after spaces to allow interception
from moshi.models import loaders, LMGen
from moshi.models.lm import _iterate_audio, encode_from_sphn

# ==========================================
# MODEL LOADING
# ==========================================

print("Downloading model weights...")
MIMI_WEIGHT = hf_hub_download(HF_REPO, loaders.MIMI_NAME)
MOSHI_WEIGHT = hf_hub_download(HF_REPO, loaders.MOSHI_NAME)
TOKENIZER_PATH = hf_hub_download(HF_REPO, loaders.TEXT_TOKENIZER_NAME)
VOICES_TGZ = hf_hub_download(HF_REPO, "voices.tgz")

# Extract voices archive
VOICES_DIR = Path(VOICES_TGZ).parent / "voices"
if not VOICES_DIR.exists():
    print("Extracting voice embeddings...")
    with tarfile.open(VOICES_TGZ, "r:gz") as tar:
        tar.extractall(path=Path(VOICES_TGZ).parent)
print("Model weights ready.")

# Load text tokenizer (CPU only, no CUDA needed)
text_tokenizer = sentencepiece.SentencePieceProcessor(TOKENIZER_PATH)

# Global model cache - models loaded lazily inside @spaces.GPU
_model_cache = {}


def get_models():
    """Lazy load models with Low VRAM strategy (CPU Offload)."""
    global _model_cache
    if "initialized" not in _model_cache:
        print(f"Loading models with Low VRAM strategy (Audio: {AUDIO_DEVICE}, LM: {LM_DEVICE})...")

        # 1. Load Mimi encoder/decoder on GPU
        # We only load ONE instance of Mimi to save VRAM
        mimi = loaders.get_mimi(MIMI_WEIGHT, AUDIO_DEVICE)

        # 2. Load Moshi LM on CPU
        # FIX: Do not cast to .half(). Let the model remain in its native BFloat16 format.
        lm = loaders.get_moshi_lm(MOSHI_WEIGHT, device=LM_DEVICE)

        # Ensure it is on CPU
        if lm.device.type != "cpu":
            lm = lm.to("cpu")

        # Set to eval mode
        lm = lm.eval()

        # Create LMGen wrapper
        # We tell LMGen that the 'device' is CPU, so it creates buffers on CPU
        frame_size = int(mimi.sample_rate / mimi.frame_rate)
        lm_gen = LMGen(
            lm,
            audio_silence_frame_cnt=int(0.5 * mimi.frame_rate),
            sample_rate=mimi.sample_rate,
            device=LM_DEVICE, # Set to CPU
            frame_rate=mimi.frame_rate,
            temp=0.8,
            temp_text=0.7,
            top_k=250,
            top_k_text=25,
        )

        # Enable streaming mode
        mimi.streaming_forever(1)
        lm_gen.streaming_forever(1)

        # Run warmup
        print("Running warmup...")
        _warmup_models(mimi, lm_gen, frame_size)
        print("Warmup complete.")

        _model_cache.update({
            "mimi": mimi,
            "lm_gen": lm_gen,
            "frame_size": frame_size,
            "initialized": True,
        })
        print("Models loaded successfully.")

    return _model_cache


def _warmup_models(mimi, lm_gen, frame_size):
    """Run warmup passes with device transfers."""
    for _ in range(2):
        # 1. Create dummy audio on GPU
        chunk = torch.zeros(1, 1, frame_size, dtype=torch.float32, device=AUDIO_DEVICE)

        # 2. Encode on GPU
        codes = mimi.encode(chunk)

        # 3. Move codes to CPU for LM
        codes_cpu = codes.to(LM_DEVICE)

        # 4. LM Step on CPU
        for c in range(codes_cpu.shape[-1]):
            tokens = lm_gen.step(codes_cpu[:, :, c:c+1])

            if tokens is not None:
                # 5. Move tokens back to GPU for Decoding
                tokens_gpu = tokens.to(AUDIO_DEVICE)
                _ = mimi.decode(tokens_gpu[:, 1:9])

    torch.cuda.synchronize()
    # Reset after warmup
    mimi.reset_streaming()
    lm_gen.reset_streaming()


def wrap_with_system_tags(text: str) -> str:
    """Add system tags as PersonaPlex expects."""
    text = text.strip()
    if text.startswith("<system>") and text.endswith("<system>"):
        return text
    return f"<system> {text} <system>"


def decode_tokens_to_pcm(mimi, tokens: torch.Tensor) -> np.ndarray:
    """Decode audio tokens to PCM waveform."""
    # tokens shape: [B, num_codebooks, 1]
    # Ensure tokens are on GPU before passing to Mimi decode
    tokens = tokens.to(AUDIO_DEVICE)

    # Agent audio is in codebooks 1:9
    agent_audio_tokens = tokens[:, 1:9, :]
    pcm = mimi.decode(agent_audio_tokens)
    return pcm[0, 0].detach().cpu().numpy()


# ==========================================
# GENERATION LOGIC
# ==========================================

@spaces.GPU(duration=120)
def generate_response(audio_input, persona: str, voice: str):
    """Process audio input and generate PersonaPlex response."""
    if audio_input is None:
        return None, "Please record audio first."

    # Get models
    models = get_models()
    mimi = models["mimi"]
    lm_gen = models["lm_gen"]
    frame_size = models["frame_size"]

    # Process input audio
    sr, audio = audio_input
    # Remember whether Gradio handed us int16 PCM before casting to float32,
    # otherwise the dtype check below would always be False after astype().
    was_int16 = audio.dtype == np.int16
    audio = audio.astype(np.float32)

    # Convert to mono if stereo
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Normalize to [-1, 1]
    if was_int16:
        audio = audio / 32768.0
    elif audio.max() > 1.0 or audio.min() < -1.0:
        audio = audio / np.abs(audio).max()

    # Resample to model's sample rate if needed
    if sr != mimi.sample_rate:
        import sphn
        audio = sphn.resample(audio, sr, mimi.sample_rate)

    # PREPEND SILENCE: Let model say its default greeting during this time (we'll discard this output)
    prepend_silence_duration = 2  # seconds
    prepend_silence = np.zeros(int(prepend_silence_duration * mimi.sample_rate), dtype=np.float32)

    # APPEND SILENCE: Give model time to complete its response after user finishes speaking
    append_silence_duration = 8  # seconds
    append_silence = np.zeros(int(append_silence_duration * mimi.sample_rate), dtype=np.float32)

    # Final audio: [prepend_silence] + [user_audio] + [append_silence]
    audio = np.concatenate([prepend_silence, audio, append_silence])

    # Calculate how many output frames to skip (corresponds to prepend silence)
    frames_to_skip = int(prepend_silence_duration * mimi.frame_rate)

    # Add channel dimension: (T,) -> (1, T)
    if audio.ndim == 1:
        audio = audio[None, :]

    # Load voice prompt
    voice_path = str(VOICES_DIR / f"{voice}.pt")
    if not os.path.exists(voice_path):
        return None, f"Voice '{voice}' not found."

    # Note: load_voice_prompt_embeddings handles CPU/GPU internally if LM is on CPU
    lm_gen.load_voice_prompt_embeddings(voice_path)

    # Set text prompt
    if persona.strip():
        lm_gen.text_prompt_tokens = text_tokenizer.encode(wrap_with_system_tags(persona))
    else:
        lm_gen.text_prompt_tokens = None

    # Run system prompts (voice + text conditioning)
    with lm_gen.streaming(1):
        # Reset streaming state inside the context
        mimi.reset_streaming()
        lm_gen.reset_streaming()

        lm_gen.step_system_prompts(mimi)
        mimi.reset_streaming()

        # Process user audio frames
        generated_frames = []
        generated_text = []
        frame_count = 0  # Track frame index to skip prepend silence output

        # Create an iterator for the audio
        audio_iterator = encode_from_sphn(
            mimi,
            _iterate_audio(audio, sample_interval_size=frame_size, pad=True),
            max_batch=1,
        )

        for user_encoded in audio_iterator:
            # user_encoded comes from mimi, so it is on GPU.
            # We MUST move it to CPU for the LM
            user_encoded_cpu = user_encoded.to(LM_DEVICE)

            for c in range(user_encoded_cpu.shape[-1]):
                step_in = user_encoded_cpu[:, :, c:c+1]

                # LM runs on CPU
                tokens = lm_gen.step(step_in)
                frame_count += 1

                if tokens is None:
                    continue

                # Skip frames generated during prepend silence (model's default greeting)
                if frame_count <= frames_to_skip:
                    continue

                # Decode agent audio
                # tokens are on CPU, mimi.decode needs GPU.
                # decode_tokens_to_pcm handles the .to(AUDIO_DEVICE) transfer.
                pcm = decode_tokens_to_pcm(mimi, tokens)
                generated_frames.append(pcm)

                # Decode text token
                # tokens is on CPU, so we can access item() directly
                text_token = tokens[0, 0, 0].item()
                if text_token not in (0, 3):  # Skip special tokens
                    text_piece = text_tokenizer.id_to_piece(text_token).replace("▁", " ")
                    generated_text.append(text_piece)

    if not generated_frames:
        return None, "No audio generated. Try speaking more clearly."

    # Concatenate output audio
    output_audio = np.concatenate(generated_frames, axis=-1)
    output_text = "".join(generated_text).strip()

    return (mimi.sample_rate, output_audio), output_text


# ==========================================
# GRADIO INTERFACE
# ==========================================

with gr.Blocks(title="PersonaPlex Demo (Low VRAM)", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎭 PersonaPlex (Low VRAM Mode)

        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            persona = gr.Textbox(
                label="Persona Description",
                placeholder="Describe the assistant's persona...",
                value=EXAMPLE_PERSONAS[0],
                lines=4,
            )
            voice = gr.Dropdown(
                choices=ALL_VOICES,
                value="NATF2",
                label="Voice"
            )
            gr.Examples(
                examples=[[p] for p in EXAMPLE_PERSONAS],
                inputs=[persona],
                label="Example Personas"
            )

        with gr.Column(scale=2):
            audio_input = gr.Audio(
                label="🎤 Record your message",
                sources=["microphone", "upload"],
                type="numpy",
            )
            generate_btn = gr.Button("Generate Response", variant="primary", size="lg")

            audio_output = gr.Audio(
                label="🔊 PersonaPlex Response",
                type="numpy",
                autoplay=True,
            )
            text_output = gr.Textbox(
                label="📝 Response Text",
                interactive=False,
            )

    generate_btn.click(
        fn=generate_response,
        inputs=[audio_input, persona, voice],
        outputs=[audio_output, text_output],
    )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        debug=True,
    )

awesome prompt

I see Blackwell (50xx) is not mentioned to be supported, is there anything preventing it to work on those cards?

For mass-market GPUs with low memory, you can use the 4-bit quantized version available here: https://huggingface.co/brianmatzelle/personaplex-7b-v1-bnb-4bit.
I successfully tested it on an RTX 3080 Ti with 12 GB of VRAM.
