import gradio as gr
import torch
import librosa
import numpy as np
import os
import traceback


# Define your PyTorch model class to match the conversion
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_features, hidden_sizes, num_classes):
        super().__init__()
        # Build the sequential model
        layers = []
        prev_size = input_features

        # Add hidden layers
        for size in hidden_sizes:
            layers.append(torch.nn.Linear(prev_size, size))
            layers.append(torch.nn.ReLU())
            prev_size = size

        # Add output layer
        layers.append(torch.nn.Linear(prev_size, num_classes))

        # Create the model
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


# Define emotions list - make sure this matches your model's output classes
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised", "calm"]  # "calm" added as the 8th emotion to match the model

# Load the PyTorch model
try:
    print("Loading PyTorch model...")

    # Parameters determined from the Keras model
    input_features = 768        # From the Keras model's first layer weights
    hidden_sizes = [256, 128, 64]  # From the Keras model architecture
    num_classes = 8             # From the Keras model's output layer

    model = EmotionClassifier(input_features, hidden_sizes, num_classes)
    model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    traceback.print_exc()
    model = None


def extract_features(audio_path, sample_rate=16000):
    """Extract features from an audio file that match what the model expects."""
    try:
        print(f"Extracting features from {audio_path}")
        audio, sr = librosa.load(audio_path, sr=sample_rate)

        # The model expects a 768-dimensional input, so combine MFCCs, chroma,
        # mel spectrogram, and spectral contrast statistics into one feature vector.

        # MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
        mfccs_mean = np.mean(mfccs.T, axis=0)
        mfccs_var = np.var(mfccs.T, axis=0)

        # Chroma features
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        chroma_mean = np.mean(chroma.T, axis=0)
        chroma_var = np.var(chroma.T, axis=0)

        # Mel spectrogram
        mel = librosa.feature.melspectrogram(y=audio, sr=sr)
        mel_mean = np.mean(mel.T, axis=0)
        mel_var = np.var(mel.T, axis=0)

        # Spectral contrast
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        contrast_mean = np.mean(contrast.T, axis=0)
        contrast_var = np.var(contrast.T, axis=0)

        # Combine all features
        features = np.hstack([
            mfccs_mean, mfccs_var,
            chroma_mean, chroma_var,
            mel_mean[:200], mel_var[:200],  # Limit mel bands to 200 each to avoid exceeding 768
            contrast_mean, contrast_var
        ])

        # Ensure we have exactly 768 features
        if len(features) < 768:
            # Pad with zeros if needed
            features = np.pad(features, (0, 768 - len(features)))
        elif len(features) > 768:
            # Truncate if too many
            features = features[:768]

        print(f"Extracted {len(features)} features")
        return features
    except Exception as e:
        print(f"Error extracting features: {e}")
        traceback.print_exc()
        return None


def predict_emotion(audio):
    """Predict emotion from audio input."""
    if model is None:
        return {emotion: 1 / len(emotions) for emotion in emotions}

    try:
        print(f"Processing audio input: {type(audio)}")

        # Process audio based on input type
        if isinstance(audio, str):
            # File path
            features = extract_features(audio)
        else:
            # Audio array from microphone - save to a temporary file first
            import tempfile
            import soundfile as sf

            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                if isinstance(audio, tuple):
                    # Gradio's numpy audio format is (sample_rate, data)
                    sample_rate, audio_array = audio
                else:
                    audio_array = audio
                    sample_rate = 16000
                sf.write(temp_file.name, audio_array, sample_rate)

            features = extract_features(temp_file.name)

            # Clean up
            os.remove(temp_file.name)

        if features is None:
            return {emotion: 1 / len(emotions) for emotion in emotions}

        # Convert features to a PyTorch tensor with a batch dimension
        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

        # Make prediction
        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)

        # Format result
        result = {emotion: float(probabilities[0][i].item()) for i, emotion in enumerate(emotions)}
        print(f"Prediction result: {result}")
        return result
    except Exception as e:
        print(f"Error in prediction: {e}")
        traceback.print_exc()
        return {emotion: 1 / len(emotions) for emotion in emotions}


# Create Gradio interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=8),  # Matches the 8 emotions
    title="Speech Emotion Recognition",
    description=(
        "Upload an audio file or record your voice to identify the emotion. "
        "This model can detect neutral, happy, sad, angry, fearful, disgust, "
        "surprised, and calm emotions."
    )
)

demo.launch()