import gradio as gr
import torch
import librosa
import numpy as np
import os
import traceback


# Define your PyTorch model class to match the conversion
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_features, hidden_sizes, num_classes):
        super().__init__()
        # Build the sequential model
        layers = []
        prev_size = input_features

        # Add hidden layers
        for size in hidden_sizes:
            layers.append(torch.nn.Linear(prev_size, size))
            layers.append(torch.nn.ReLU())
            prev_size = size

        # Add output layer
        layers.append(torch.nn.Linear(prev_size, num_classes))

        # Create the model
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


# Define emotions list - make sure this matches your model's output classes
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised", "calm"]  # "calm" added as the 8th emotion to match the model

# Load the PyTorch model
try:
    print("Loading PyTorch model...")

    # Parameters determined from the Keras model
    input_features = 768        # From the Keras model's first layer weights
    hidden_sizes = [256, 128, 64]  # From the Keras model architecture
    num_classes = 8             # From the Keras model's output layer

    model = EmotionClassifier(input_features, hidden_sizes, num_classes)
    model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    traceback.print_exc()
    model = None


def extract_features(audio_path, sample_rate=16000):
    """Extract features from an audio file that match what the model expects."""
    try:
        print(f"Extracting features from {audio_path}")
        audio, sr = librosa.load(audio_path, sr=sample_rate)

        # The model expects a 768-dimensional input, so combine MFCCs, chroma,
        # mel spectrogram, and spectral contrast statistics into one feature vector.

        # MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
        mfccs_mean = np.mean(mfccs.T, axis=0)
        mfccs_var = np.var(mfccs.T, axis=0)

        # Chroma features
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        chroma_mean = np.mean(chroma.T, axis=0)
        chroma_var = np.var(chroma.T, axis=0)

        # Mel spectrogram
        mel = librosa.feature.melspectrogram(y=audio, sr=sr)
        mel_mean = np.mean(mel.T, axis=0)
        mel_var = np.var(mel.T, axis=0)

        # Spectral contrast
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        contrast_mean = np.mean(contrast.T, axis=0)
        contrast_var = np.var(contrast.T, axis=0)

        # Combine all features
        features = np.hstack([
            mfccs_mean, mfccs_var,
            chroma_mean, chroma_var,
            mel_mean[:200], mel_var[:200],  # Limit mel bands to 200 each to avoid exceeding 768
            contrast_mean, contrast_var
        ])

        # Ensure we have exactly 768 features
        if len(features) < 768:
            # Pad with zeros if needed
            features = np.pad(features, (0, 768 - len(features)))
        elif len(features) > 768:
            # Truncate if too many
            features = features[:768]

        print(f"Extracted {len(features)} features")
        return features
    except Exception as e:
        print(f"Error extracting features: {e}")
        traceback.print_exc()
        return None


def predict_emotion(audio):
    """Predict emotion from audio input."""
    if model is None:
        return {emotion: 1 / len(emotions) for emotion in emotions}

    try:
        print(f"Processing audio input: {type(audio)}")

        # Process audio based on input type
        if isinstance(audio, str):
            # File path
            features = extract_features(audio)
        else:
            # Audio array from microphone - save to a temporary file first
            import tempfile
            import soundfile as sf

            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                if isinstance(audio, tuple):
                    # Gradio's numpy audio format is (sample_rate, data)
                    sample_rate, audio_array = audio
                else:
                    audio_array = audio
                    sample_rate = 16000
                sf.write(temp_file.name, audio_array, sample_rate)

            features = extract_features(temp_file.name)

            # Clean up
            os.remove(temp_file.name)

        if features is None:
            return {emotion: 1 / len(emotions) for emotion in emotions}

        # Convert features to a PyTorch tensor with a batch dimension
        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

        # Make prediction
        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)

        # Format result
        result = {emotion: float(probabilities[0][i].item()) for i, emotion in enumerate(emotions)}
        print(f"Prediction result: {result}")
        return result
    except Exception as e:
        print(f"Error in prediction: {e}")
        traceback.print_exc()
        return {emotion: 1 / len(emotions) for emotion in emotions}


# Create Gradio interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=8),  # Matches the 8 emotions
    title="Speech Emotion Recognition",
    description=(
        "Upload an audio file or record your voice to identify the emotion. "
        "This model can detect neutral, happy, sad, angry, fearful, disgust, "
        "surprised, and calm emotions."
    )
)

demo.launch()