Spaces:

yl4579
/

DMOSpeech2-demo

Runtime error

App Files Community

yl4579 committed on Jul 21

Commit

1b8d1f0

verified ·

1 Parent(s): 2598aa3

Update app.py

Browse files

Files changed (1) hide show

app.py +279 -401

app.py CHANGED Viewed

@@ -2,471 +2,349 @@ import gradio as gr
 import torch
 import torchaudio
 import numpy as np
-from pathlib import Path
 import tempfile
-# Import the DMOInference class (assuming it's in a file called dmo_inference.py)
 from infer import DMOInference
-def initialize_model(student_checkpoint, duration_predictor_checkpoint, model_type, device, cuda_device_id):
-    """Initialize the DMOSpeech 2 model with given checkpoints."""
     try:
         model = DMOInference(
-            student_checkpoint_path=student_checkpoint,
-            duration_predictor_path=duration_predictor_checkpoint,
             device=device,
-            model_type=model_type,
-            tokenizer="pinyin",
-            dataset_name="Emilia_ZH_EN",
-            cuda_device_id=str(cuda_device_id)
         )
-        return model, "Model initialized successfully!"
     except Exception as e:
-        return None, f"Error initializing model: {str(e)}"
 def generate_speech(
-    model,
-    generation_mode,
     prompt_audio,
     prompt_text,
     target_text,
-    # Duration settings
-    duration_mode,
-    manual_duration,
-    dp_softmax_range,
-    dp_temperature,
-    # Teacher-student settings
-    teacher_steps,
-    teacher_stopping_time,
-    student_start_step,
     # Advanced settings
-    eta,
-    cfg_strength,
-    sway_coefficient,
-    # Teacher-guided specific
-    tg_switch_time,
-    tg_teacher_steps,
-    tg_student_steps
 ):
-    """Generate speech using the selected mode and parameters."""
-    if model is None:
-        return None, "Please initialize the model first!"
     if prompt_audio is None:
-        return None, "Please upload a reference audio!"
     if not target_text:
-        return None, "Please enter target text to generate!"
     try:
-        # Convert prompt_text to None if empty (for ASR)
-        prompt_text = prompt_text.strip() if prompt_text else None
-        # Determine duration
-        if duration_mode == "automatic":
-            duration = None
-        else:
-            duration = int(manual_duration)
-        # Generate based on selected mode
-        if generation_mode == "Student-Only (4 steps)":
-            # Standard DMOSpeech 2 generation
-            generated_wave = model.generate(
-                gen_text=target_text,
-                audio_path=prompt_audio,
-                prompt_text=prompt_text,
-                teacher_steps=0,  # No teacher guidance
-                student_start_step=1,
-                duration=duration,
-                dp_softmax_range=dp_softmax_range,
-                temperature=dp_temperature,
-                eta=eta,
-                cfg_strength=cfg_strength,
-                sway_coefficient=sway_coefficient,
-                verbose=True
-            )
-        elif generation_mode == "Teacher-Student Distillation":
-            # Full teacher-student distillation
-            generated_wave = model.generate(
-                gen_text=target_text,
-                audio_path=prompt_audio,
-                prompt_text=prompt_text,
-                teacher_steps=teacher_steps,
-                teacher_stopping_time=teacher_stopping_time,
-                student_start_step=student_start_step,
-                duration=duration,
-                dp_softmax_range=dp_softmax_range,
-                temperature=dp_temperature,
-                eta=eta,
-                cfg_strength=cfg_strength,
-                sway_coefficient=sway_coefficient,
-                verbose=True
-            )
-        elif generation_mode == "Teacher-Only":
-            # Teacher-only generation
-            generated_wave = model.generate_teacher_only(
-                gen_text=target_text,
-                audio_path=prompt_audio,
-                prompt_text=prompt_text,
-                teacher_steps=teacher_steps,
-                duration=duration,
-                eta=eta,
-                cfg_strength=cfg_strength,
-                sway_coefficient=sway_coefficient
-            )
-        elif generation_mode == "Teacher-Guided Sampling":
-            # Implement teacher-guided sampling
-            # This would require implementing the teacher-guided sampling algorithm
-            # For now, we'll use the regular generation with specific parameters
-            total_teacher_steps = tg_teacher_steps
-            generated_wave = model.generate(
-                gen_text=target_text,
-                audio_path=prompt_audio,
-                prompt_text=prompt_text,
-                teacher_steps=total_teacher_steps,
-                teacher_stopping_time=tg_switch_time,
-                student_start_step=1,
-                duration=duration,
-                dp_softmax_range=dp_softmax_range,
-                temperature=dp_temperature,
-                eta=eta,
-                cfg_strength=cfg_strength,
-                sway_coefficient=sway_coefficient,
-                verbose=True
-            )
-        # Save generated audio
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             output_path = tmp_file.name
-        # Convert to tensor and save
-        if isinstance(generated_wave, np.ndarray):
-            generated_wave = torch.from_numpy(generated_wave)
-        if generated_wave.dim() == 1:
-            generated_wave = generated_wave.unsqueeze(0)
-        torchaudio.save(output_path, generated_wave, 24000)
-        return output_path, "Speech generated successfully!"
-    except Exception as e:
-        return None, f"Error generating speech: {str(e)}"
-def predict_duration_only(
-    model,
-    prompt_audio,
-    prompt_text,
-    target_text,
-    dp_softmax_range,
-    dp_temperature
-):
-    """Predict duration for the target text."""
-    if model is None:
-        return "Please initialize the model first!"
-    if prompt_audio is None:
-        return "Please upload a reference audio!"
-    if not target_text:
-        return "Please enter target text!"
-    try:
-        prompt_text = prompt_text.strip() if prompt_text else None
-        predicted_duration = model.predict_duration(
-            pmt_wav_path=prompt_audio,
-            tar_text=target_text,
-            pmt_text=prompt_text,
-            dp_softmax_range=dp_softmax_range,
-            temperature=dp_temperature
-        )
-        return f"Predicted duration: {predicted_duration} frames (~{predicted_duration/100:.2f} seconds)"
     except Exception as e:
-        return f"Error predicting duration: {str(e)}"
 # Create Gradio interface
-with gr.Blocks(title="DMOSpeech 2: Advanced Zero-Shot TTS") as demo:
-    gr.Markdown("""
-    # DMOSpeech 2: Reinforcement Learning for Duration Prediction in Metric-Optimized Speech Synthesis
-    This demo showcases DMOSpeech 2, which features:
-    - **Direct metric optimization** for speaker similarity and intelligibility
-    - **RL-optimized duration prediction** for better speech quality
-    - **Teacher-guided sampling** for improved diversity
-    - **Efficient 4-step generation** while maintaining high quality
-    """)
-    # Model state
-    model_state = gr.State(None)
-    with gr.Tab("Model Setup"):
-        gr.Markdown("### Initialize Model")
-        with gr.Row():
-            student_checkpoint = gr.Textbox(
-                label="Student Model Checkpoint Path",
-                placeholder="/path/to/student_checkpoint.pt"
             )
-            duration_checkpoint = gr.Textbox(
-                label="Duration Predictor Checkpoint Path",
-                placeholder="/path/to/duration_predictor.pt"
-            )
-        with gr.Row():
-            model_type = gr.Dropdown(
-                choices=["F5TTS_Base", "E2TTS_Base"],
-                value="F5TTS_Base",
-                label="Model Type"
             )
-            device = gr.Dropdown(
-                choices=["cuda", "cpu"],
-                value="cuda",
-                label="Device"
             )
-            cuda_device_id = gr.Number(
-                value=0,
-                label="CUDA Device ID",
-                precision=0
             )
-        init_button = gr.Button("Initialize Model", variant="primary")
-        init_status = gr.Textbox(label="Initialization Status", interactive=False)
-    with gr.Tab("Speech Generation"):
-        with gr.Row():
-            with gr.Column(scale=1):
-                gr.Markdown("### Input Settings")
-                prompt_audio = gr.Audio(
-                    label="Reference Audio",
-                    type="filepath",
-                    sources=["upload", "microphone"]
-                )
-                prompt_text = gr.Textbox(
-                    label="Reference Text (optional - will use ASR if empty)",
-                    placeholder="The text spoken in the reference audio..."
-                )
-                target_text = gr.Textbox(
-                    label="Target Text to Generate",
-                    placeholder="Enter the text you want to synthesize...",
-                    lines=3
-                )
-                generation_mode = gr.Radio(
-                    choices=[
-                        "Student-Only (4 steps)",
-                        "Teacher-Student Distillation",
-                        "Teacher-Only",
-                        "Teacher-Guided Sampling"
-                    ],
-                    value="Student-Only (4 steps)",
-                    label="Generation Mode"
-                )
-            with gr.Column(scale=1):
-                gr.Markdown("### Duration Settings")
-                duration_mode = gr.Radio(
-                    choices=["automatic", "manual"],
-                    value="automatic",
-                    label="Duration Mode"
-                )
-                manual_duration = gr.Slider(
-                    minimum=100,
-                    maximum=3000,
-                    value=500,
-                    step=10,
-                    label="Manual Duration (frames)",
-                    visible=False
-                )
-                dp_softmax_range = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=0.7,
-                    step=0.1,
-                    label="Duration Predictor Softmax Range"
-                )
-                dp_temperature = gr.Slider(
                     minimum=0.0,
                     maximum=2.0,
                     value=0.0,
                     step=0.1,
-                    label="Duration Predictor Temperature (0=argmax)"
-                )
-                predict_duration_btn = gr.Button("Predict Duration Only")
-                duration_output = gr.Textbox(label="Predicted Duration", interactive=False)
-        with gr.Accordion("Advanced Settings", open=False):
-            with gr.Tab("Teacher-Student Settings"):
-                teacher_steps = gr.Slider(
-                    minimum=0,
-                    maximum=32,
-                    value=16,
-                    step=1,
-                    label="Teacher Steps"
-                )
-                teacher_stopping_time = gr.Slider(
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.07,
-                    step=0.01,
-                    label="Teacher Stopping Time"
-                )
-                student_start_step = gr.Slider(
-                    minimum=1,
-                    maximum=4,
-                    value=1,
-                    step=1,
-                    label="Student Start Step"
-                )
-            with gr.Tab("Sampling Settings"):
-                eta = gr.Slider(
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=1.0,
-                    step=0.1,
-                    label="Eta (Stochasticity: 0=DDIM, 1=DDPM)"
-                )
-                cfg_strength = gr.Slider(
-                    minimum=0.0,
-                    maximum=5.0,
-                    value=2.0,
-                    step=0.1,
-                    label="CFG Strength"
-                )
-                sway_coefficient = gr.Slider(
-                    minimum=-2.0,
-                    maximum=2.0,
-                    value=-1.0,
-                    step=0.1,
-                    label="Sway Sampling Coefficient"
-                )
-            with gr.Tab("Teacher-Guided Settings"):
-                tg_switch_time = gr.Slider(
-                    minimum=0.1,
-                    maximum=0.5,
-                    value=0.25,
-                    step=0.05,
-                    label="Switch Time (when to transition to student)"
-                )
-                tg_teacher_steps = gr.Slider(
-                    minimum=6,
-                    maximum=20,
-                    value=14,
-                    step=1,
-                    label="Teacher Steps"
                 )
-                tg_student_steps = gr.Slider(
-                    minimum=1,
-                    maximum=4,
-                    value=2,
-                    step=1,
-                    label="Student Steps"
                 )
-        generate_button = gr.Button("Generate Speech", variant="primary")
-        with gr.Row():
-            output_audio = gr.Audio(label="Generated Speech", type="filepath")
-            generation_status = gr.Textbox(label="Generation Status", interactive=False)
-    with gr.Tab("Examples & Info"):
-        gr.Markdown("""
-        ### Usage Tips:
-        1. **Generation Modes:**
-           - **Student-Only (4 steps)**: Fastest, uses the distilled model with direct metric optimization
-           - **Teacher-Student Distillation**: Uses teacher guidance for initial steps
-           - **Teacher-Only**: Full quality but slower (32 steps)
-           - **Teacher-Guided Sampling**: Best balance of quality and diversity
-        2. **Duration Settings:**
-           - **Automatic**: Uses RL-optimized duration predictor
-           - **Manual**: Specify exact duration in frames (100 frames ≈ 1 second)
-        3. **Advanced Parameters:**
-           - **Eta**: Controls sampling stochasticity (0 = deterministic, 1 = fully stochastic)
-           - **CFG Strength**: Higher values = stronger adherence to text
-           - **Sway Coefficient**: Negative values focus on early denoising steps
-        ### Key Features:
-        - ✅ 5× faster than teacher model
-        - ✅ Better WER and speaker similarity
-        - ✅ RL-optimized duration prediction
-        - ✅ Maintains prosodic diversity with teacher-guided sampling
-        """)
-    # Event handlers
-    duration_mode.change(
-        lambda x: gr.update(visible=(x == "manual")),
-        inputs=[duration_mode],
-        outputs=[manual_duration]
-    )
-    init_button.click(
-        lambda sc, dc, mt, d, cid: initialize_model(sc, dc, mt, d, cid),
-        inputs=[student_checkpoint, duration_checkpoint, model_type, device, cuda_device_id],
-        outputs=[model_state, init_status]
-    )
-    generate_button.click(
         generate_speech,
         inputs=[
-            model_state,
-            generation_mode,
             prompt_audio,
             prompt_text,
             target_text,
-            duration_mode,
-            manual_duration,
-            dp_softmax_range,
-            dp_temperature,
-            teacher_steps,
-            teacher_stopping_time,
-            student_start_step,
-            eta,
-            cfg_strength,
-            sway_coefficient,
-            tg_switch_time,
-            tg_teacher_steps,
-            tg_student_steps
         ],
-        outputs=[output_audio, generation_status]
     )
-    predict_duration_btn.click(
-        predict_duration_only,
-        inputs=[
-            model_state,
-            prompt_audio,
-            prompt_text,
-            target_text,
-            dp_softmax_range,
-            dp_temperature
-        ],
-        outputs=[duration_output]
     )
 if __name__ == "__main__":
-    demo.launch(share=True)

 import torch
 import torchaudio
 import numpy as np
 import tempfile
+import time
+from pathlib import Path
+from huggingface_hub import hf_hub_download
+import os
+# Import the inference module (assuming it's named 'infer.py' based on the notebook)
 from infer import DMOInference
+# Global model instance
+model = None
+device = "cuda" if torch.cuda.is_available() else "cpu"
+def download_models():
+    """Download models from HuggingFace Hub."""
+    try:
+        print("Downloading models from HuggingFace...")
+        # Download student model
+        student_path = hf_hub_download(
+            repo_id="yl4579/DMOSpeech2",
+            filename="model_85000.pt",
+            cache_dir="./models"
+        )
+        # Download duration predictor
+        duration_path = hf_hub_download(
+            repo_id="yl4579/DMOSpeech2",
+            filename="model_1500.pt",
+            cache_dir="./models"
+        )
+        print(f"Student model: {student_path}")
+        print(f"Duration model: {duration_path}")
+        return student_path, duration_path
+    except Exception as e:
+        print(f"Error downloading models: {e}")
+        return None, None
+def initialize_model():
+    """Initialize the model on startup."""
+    global model
     try:
+        # Download models
+        student_path, duration_path = download_models()
+        if not student_path or not duration_path:
+            return False, "Failed to download models from HuggingFace"
+        # Initialize model
         model = DMOInference(
+            student_checkpoint_path=student_path,
+            duration_predictor_path=duration_path,
             device=device,
+            model_type="F5TTS_Base"
         )
+        return True, f"Model loaded successfully on {device.upper()}"
     except Exception as e:
+        return False, f"Error initializing model: {str(e)}"
+# Initialize model on startup
+model_loaded, status_message = initialize_model()
 def generate_speech(
     prompt_audio,
     prompt_text,
     target_text,
+    mode,
     # Advanced settings
+    custom_teacher_steps,
+    custom_teacher_stopping_time,
+    custom_student_start_step,
+    temperature,
+    verbose
 ):
+    """Generate speech with different configurations."""
+    if not model_loaded or model is None:
+        return None, "Model not loaded! Please refresh the page.", "", ""
     if prompt_audio is None:
+        return None, "Please upload a reference audio!", "", ""
     if not target_text:
+        return None, "Please enter text to generate!", "", ""
     try:
+        start_time = time.time()
+        # Configure parameters based on mode
+        if mode == "Student Only (4 steps)":
+            teacher_steps = 0
+            student_start_step = 0
+            teacher_stopping_time = 1.0
+        elif mode == "Teacher-Guided (8 steps)":
+            # Default configuration from the notebook
+            teacher_steps = 16
+            teacher_stopping_time = 0.07
+            student_start_step = 1
+        elif mode == "High Diversity (16 steps)":
+            teacher_steps = 24
+            teacher_stopping_time = 0.3
+            student_start_step = 2
+        else:  # Custom
+            teacher_steps = custom_teacher_steps
+            teacher_stopping_time = custom_teacher_stopping_time
+            student_start_step = custom_student_start_step
+        # Generate speech
+        generated_audio = model.generate(
+            gen_text=target_text,
+            audio_path=prompt_audio,
+            prompt_text=prompt_text if prompt_text else None,
+            teacher_steps=teacher_steps,
+            teacher_stopping_time=teacher_stopping_time,
+            student_start_step=student_start_step,
+            temperature=temperature,
+            verbose=verbose
+        )
+        end_time = time.time()
+        # Calculate metrics
+        processing_time = end_time - start_time
+        audio_duration = generated_audio.shape[-1] / 24000
+        rtf = processing_time / audio_duration
+        # Save audio
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             output_path = tmp_file.name
+        if isinstance(generated_audio, np.ndarray):
+            generated_audio = torch.from_numpy(generated_audio)
+        if generated_audio.dim() == 1:
+            generated_audio = generated_audio.unsqueeze(0)
+        torchaudio.save(output_path, generated_audio, 24000)
+        # Format metrics
+        metrics = f"RTF: {rtf:.2f}x ({1/rtf:.2f}x speed) | Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio"
+        return output_path, "Success!", metrics, f"Mode: {mode}"
     except Exception as e:
+        return None, f"Error: {str(e)}", "", ""
 # Create Gradio interface
+with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"""
+    # 🎙️ DMOSpeech 2: Zero-Shot Text-to-Speech
+    Generate natural speech in any voice with just a short reference audio!
+    **Model Status:** {status_message} | **Device:** {device.upper()}
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Reference audio input
+            prompt_audio = gr.Audio(
+                label="📎 Reference Audio",
+                type="filepath",
+                sources=["upload", "microphone"]
             )
+            prompt_text = gr.Textbox(
+                label="📝 Reference Text (optional - will auto-transcribe if empty)",
+                placeholder="The text spoken in the reference audio...",
+                lines=2
             )
+            target_text = gr.Textbox(
+                label="✍️ Text to Generate",
+                placeholder="Enter the text you want to synthesize...",
+                lines=4
             )
+            # Generation mode
+            mode = gr.Radio(
+                choices=[
+                    "Student Only (4 steps)",
+                    "Teacher-Guided (8 steps)",
+                    "High Diversity (16 steps)",
+                    "Custom"
+                ],
+                value="Teacher-Guided (8 steps)",
+                label="🚀 Generation Mode",
+                info="Choose speed vs quality/diversity tradeoff"
             )
+            # Advanced settings (collapsible)
+            with gr.Accordion("⚙️ Advanced Settings", open=False):
+                with gr.Row():
+                    custom_teacher_steps = gr.Slider(
+                        minimum=0,
+                        maximum=32,
+                        value=16,
+                        step=1,
+                        label="Teacher Steps",
+                        info="More steps = higher quality"
+                    )
+                    custom_teacher_stopping_time = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.07,
+                        step=0.01,
+                        label="Teacher Stopping Time",
+                        info="When to switch to student"
+                    )
+                    custom_student_start_step = gr.Slider(
+                        minimum=0,
+                        maximum=4,
+                        value=1,
+                        step=1,
+                        label="Student Start Step",
+                        info="Which student step to start from"
+                    )
+                temperature = gr.Slider(
                     minimum=0.0,
                     maximum=2.0,
                     value=0.0,
                     step=0.1,
+                    label="Duration Temperature",
+                    info="0 = deterministic, >0 = more variation in speech rhythm"
                 )
+                verbose = gr.Checkbox(
+                    value=False,
+                    label="Verbose Output",
+                    info="Show detailed generation steps"
                 )
+            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            # Output
+            output_audio = gr.Audio(
+                label="🔊 Generated Speech",
+                type="filepath",
+                autoplay=True
+            )
+            status = gr.Textbox(
+                label="Status",
+                interactive=False
+            )
+            metrics = gr.Textbox(
+                label="Performance Metrics",
+                interactive=False
+            )
+            info = gr.Textbox(
+                label="Generation Info",
+                interactive=False
+            )
+            # Tips
+            gr.Markdown("""
+            ### 💡 Quick Tips:
+            - **Student Only**: Fastest (4 steps), good quality
+            - **Teacher-Guided**: Best balance (8 steps), recommended
+            - **High Diversity**: More natural prosody (16 steps)
+            - **Temperature**: Add randomness to speech rhythm
+            ### 📊 Expected RTF (Real-Time Factor):
+            - Student Only: ~0.05x (20x faster than real-time)
+            - Teacher-Guided: ~0.10x (10x faster)
+            - High Diversity: ~0.20x (5x faster)
+            """)
+    # Examples section
+    gr.Markdown("### 🎯 Examples")
+    examples = [
+        [
+            None,  # Will be replaced with actual audio path
+            "Some call me nature, others call me mother nature.",
+            "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
+            "Teacher-Guided (8 steps)",
+            16, 0.07, 1, 0.0, False
+        ],
+        [
+            None,  # Will be replaced with actual audio path
+            "对，这就是我，万人敬仰的太乙真人。",
+            '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"',
+            "Teacher-Guided (8 steps)",
+            16, 0.07, 1, 0.0, False
+        ],
+        [
+            None,
+            "对，这就是我，万人敬仰的太乙真人。",
+            '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"',
+            "High Diversity (16 steps)",
+            24, 0.3, 2, 0.8, False
+        ]
+    ]
+    # Note about example audio files
+    gr.Markdown("""
+    *Note: Example audio files should be uploaded to the Space. The examples above show the text configurations used in the original notebook.*
+    """)
+    # Event handler
+    generate_btn.click(
         generate_speech,
         inputs=[
             prompt_audio,
             prompt_text,
             target_text,
+            mode,
+            custom_teacher_steps,
+            custom_teacher_stopping_time,
+            custom_student_start_step,
+            temperature,
+            verbose
         ],
+        outputs=[output_audio, status, metrics, info]
     )
+    # Update visibility of custom settings based on mode
+    def update_custom_visibility(mode):
+        return gr.update(visible=(mode == "Custom"))
+    mode.change(
+        lambda x: [gr.update(interactive=(x == "Custom"))] * 3,
+        inputs=[mode],
+        outputs=[custom_teacher_steps, custom_teacher_stopping_time, custom_student_start_step]
     )
+# Launch the app
 if __name__ == "__main__":
+    if not model_loaded:
+        print(f"Warning: Model failed to load - {status_message}")
+    demo.launch()